001 package aima.learning.neural; 002 003 import java.io.BufferedReader; 004 import java.io.InputStreamReader; 005 import java.util.ArrayList; 006 import java.util.Arrays; 007 import java.util.List; 008 009 import aima.learning.framework.DataSet; 010 import aima.learning.framework.Example; 011 import aima.util.Pair; 012 import aima.util.Util; 013 014 public abstract class NNDataSet { 015 /* 016 * This class represents a source of examples to the rest of the nn 017 * framework. Assumes only one function approximator works on an instance at 018 * a given point in time 019 */ 020 /* 021 * the parsed and preprocessed form of the dataset. 022 */ 023 private List<NNExample> dataset; 024 /* 025 * a copy from which examples are drawn. 026 */ 027 private List<NNExample> presentlyProcessed = new ArrayList<NNExample>();; 028 029 /* 030 * list of mean Values for all components of raw data set 031 */ 032 private List<Double> means; 033 034 /* 035 * list of stdev Values for all components of raw data set 036 */ 037 private List<Double> stdevs; 038 /* 039 * the normalized data set 040 */ 041 protected List<List<Double>> nds; 042 043 /* 044 * the column numbers of the "target" 045 */ 046 047 protected List<Integer> targetColumnNumbers; 048 049 /* 050 * population delegated to subclass because only subclass knows which 051 * column(s) is target 052 */ 053 public abstract void setTargetColumns(); 054 055 /* 056 * create Example instances from a normalized data "table". 057 */ 058 private void createExamples() { 059 dataset = new ArrayList<NNExample>(); 060 for (List<Double> dataLine : nds) { 061 List<Double> input = new ArrayList<Double>(); 062 List<Double> target = new ArrayList<Double>(); 063 for (int i = 0; i < dataLine.size(); i++) { 064 if (targetColumnNumbers.contains(i)) { 065 target.add(dataLine.get(i)); 066 } else { 067 input.add(dataLine.get(i)); 068 } 069 } 070 dataset.add(new NNExample(input, target)); 071 } 072 refreshDataset();// to populate the preentlyProcessed dataset 073 } 074 075 /* 076 * create a normalized data "table" from the data in the file. At this 077 * stage, the data is *not* split into input pattern and tragets 078 */ 079 080 public void createNormalizedDataFromFile(String filename) throws Exception { 081 082 List<List<Double>> rds = new ArrayList<List<Double>>(); 083 084 // create raw data set 085 BufferedReader reader = new BufferedReader( 086 new InputStreamReader(this.getClass().getResourceAsStream( 087 "../data/" + filename + ".csv"))); 088 String line; 089 while ((line = reader.readLine()) != null) { 090 rds.add(exampleFromString(line, ",")); 091 } 092 093 // normalize raw dataset 094 nds = normalize(rds); 095 } 096 097 /* 098 * create a normalized data "table" from the DataSet using numerizer. At 099 * this stage, the data is *not* split into input pattern and targets TODO 100 * remove redundancy of recreating the target columns. the numerizer has 101 * already isolated the targets 102 */ 103 104 public void createNormalizedDataFromDataSet(DataSet ds, Numerizer numerizer) 105 throws Exception { 106 107 List<List<Double>> rds = rawExamplesFromDataSet(ds, numerizer); 108 // normalize raw dataset 109 nds = normalize(rds); 110 } 111 112 private List<List<Double>> normalize(List<List<Double>> rds) { 113 int rawDataLength = rds.get(0).size(); 114 List<List<Double>> nds = new ArrayList<List<Double>>(); 115 116 means = new ArrayList<Double>(); 117 stdevs = new ArrayList<Double>(); 118 119 List<List<Double>> normalizedColumns = new ArrayList<List<Double>>(); 120 // clculate means for each coponent of example data 121 for (int i = 0; i < rawDataLength; i++) { 122 List<Double> columnValues = new ArrayList<Double>(); 123 for (List<Double> rawDatum : rds) { 124 columnValues.add(rawDatum.get(i)); 125 } 126 double mean = Util.calculateMean(columnValues); 127 means.add(mean); 128 129 double stdev = Util.calculateStDev(columnValues, mean); 130 stdevs.add(stdev); 131 132 normalizedColumns.add(Util.normalizeFromMeanAndStdev(columnValues, 133 mean, stdev)); 134 135 } 136 // re arrange data from columns 137 // TODO Assert normalized columns have same size etc 138 139 int columnLength = normalizedColumns.get(0).size(); 140 int numberOfColumns = normalizedColumns.size(); 141 for (int i = 0; i < columnLength; i++) { 142 List<Double> lst = new ArrayList<Double>(); 143 for (int j = 0; j < numberOfColumns; j++) { 144 lst.add(normalizedColumns.get(j).get(i)); 145 } 146 nds.add(lst); 147 } 148 return nds; 149 } 150 151 private List<Double> exampleFromString(String line, String separator) { 152 // assumes all values for inout and target are doubles 153 List<Double> rexample = new ArrayList<Double>(); 154 List<String> attributeValues = Arrays.asList(line.split(separator)); 155 for (String valString : attributeValues) { 156 rexample.add(Double.parseDouble(valString)); 157 } 158 return rexample; 159 } 160 161 private List<List<Double>> rawExamplesFromDataSet(DataSet ds, 162 Numerizer numerizer) { 163 // assumes all values for inout and target are doubles 164 List<List<Double>> rds = new ArrayList<List<Double>>(); 165 for (int i = 0; i < ds.size(); i++) { 166 List<Double> rexample = new ArrayList<Double>(); 167 Example e = ds.getExample(i); 168 Pair<List<Double>, List<Double>> p = numerizer.numerize(e); 169 List<Double> attributes = p.getFirst(); 170 for (Double d : attributes) { 171 rexample.add(d); 172 } 173 List<Double> targets = p.getSecond(); 174 for (Double d : targets) { 175 rexample.add(d); 176 } 177 rds.add(rexample); 178 } 179 return rds; 180 } 181 182 /* 183 * Gets (and removes) a random example from the 'presentlyProcessed' 184 */ 185 public NNExample getExampleAtRandom() { 186 187 int i = Util.randomNumberBetween(0, (presentlyProcessed.size() - 1)); 188 return presentlyProcessed.remove(i); 189 } 190 191 /* 192 * Gets (and removes) a random example from the 'presentlyProcessed' 193 */ 194 public NNExample getExample(int index) { 195 196 return presentlyProcessed.remove(index); 197 } 198 199 /* 200 * check if any more examples remain to be processed 201 */ 202 public boolean hasMoreExamples() { 203 return presentlyProcessed.size() > 0; 204 } 205 206 /* 207 * check how many examples remain to be processed 208 */ 209 public int howManyExamplesLeft() { 210 return presentlyProcessed.size(); 211 } 212 213 /* 214 * refreshes the presentlyProcessed dataset so it can be used for a new 215 * epoch of training. 216 */ 217 public void refreshDataset() { 218 presentlyProcessed = new ArrayList<NNExample>(); 219 for (NNExample e : dataset) { 220 presentlyProcessed.add(e.copyExample()); 221 } 222 } 223 224 /* 225 * method called by clients to set up data set and make it ready for 226 * processing 227 */ 228 public void createExamplesFromFile(String filename) throws Exception { 229 createNormalizedDataFromFile(filename); 230 setTargetColumns(); 231 createExamples(); 232 233 } 234 235 /* 236 * method called by clients to set up data set and make it ready for 237 * processing 238 */ 239 public void createExamplesFromDataSet(DataSet ds, Numerizer numerizer) 240 throws Exception { 241 createNormalizedDataFromDataSet(ds, numerizer); 242 setTargetColumns(); 243 createExamples(); 244 245 } 246 247 public List<List<Double>> getNormalizedData() { 248 return nds; 249 } 250 251 public List<Double> getMeans() { 252 return means; 253 } 254 255 public List<Double> getStdevs() { 256 return stdevs; 257 } 258 259 }