001    package aima.learning.neural;
002    
003    import java.io.BufferedReader;
004    import java.io.InputStreamReader;
005    import java.util.ArrayList;
006    import java.util.Arrays;
007    import java.util.List;
008    
009    import aima.learning.framework.DataSet;
010    import aima.learning.framework.Example;
011    import aima.util.Pair;
012    import aima.util.Util;
013    
014    public abstract class NNDataSet {
015            /*
016             * This class represents a source of examples to the rest of the nn
017             * framework. Assumes only one function approximator works on an instance at
018             * a given point in time
019             */
020            /*
021             * the parsed and preprocessed form of the dataset.
022             */
023            private List<NNExample> dataset;
024            /*
025             * a copy from which examples are drawn.
026             */
027            private List<NNExample> presentlyProcessed = new ArrayList<NNExample>();;
028    
029            /*
030             * list of mean Values for all components of raw data set
031             */
032            private List<Double> means;
033    
034            /*
035             * list of stdev Values for all components of raw data set
036             */
037            private List<Double> stdevs;
038            /*
039             * the normalized data set
040             */
041            protected List<List<Double>> nds;
042    
043            /*
044             * the column numbers of the "target"
045             */
046    
047            protected List<Integer> targetColumnNumbers;
048    
049            /*
050             * population delegated to subclass because only subclass knows which
051             * column(s) is target
052             */
053            public abstract void setTargetColumns();
054    
055            /*
056             * create Example instances from a normalized data "table".
057             */
058            private void createExamples() {
059                    dataset = new ArrayList<NNExample>();
060                    for (List<Double> dataLine : nds) {
061                            List<Double> input = new ArrayList<Double>();
062                            List<Double> target = new ArrayList<Double>();
063                            for (int i = 0; i < dataLine.size(); i++) {
064                                    if (targetColumnNumbers.contains(i)) {
065                                            target.add(dataLine.get(i));
066                                    } else {
067                                            input.add(dataLine.get(i));
068                                    }
069                            }
070                            dataset.add(new NNExample(input, target));
071                    }
072                    refreshDataset();// to populate the preentlyProcessed dataset
073            }
074    
075            /*
076             * create a normalized data "table" from the data in the file. At this
077             * stage, the data is *not* split into input pattern and tragets
078             */
079    
080            public void createNormalizedDataFromFile(String filename) throws Exception {
081    
082                    List<List<Double>> rds = new ArrayList<List<Double>>();
083    
084                    // create raw data set
085                    BufferedReader reader = new BufferedReader(
086                                    new InputStreamReader(this.getClass().getResourceAsStream(
087                                                    "../data/" + filename + ".csv")));
088                    String line;
089                    while ((line = reader.readLine()) != null) {
090                            rds.add(exampleFromString(line, ","));
091                    }
092    
093                    // normalize raw dataset
094                    nds = normalize(rds);
095            }
096    
097            /*
098             * create a normalized data "table" from the DataSet using numerizer. At
099             * this stage, the data is *not* split into input pattern and targets TODO
100             * remove redundancy of recreating the target columns. the numerizer has
101             * already isolated the targets
102             */
103    
104            public void createNormalizedDataFromDataSet(DataSet ds, Numerizer numerizer)
105                            throws Exception {
106    
107                    List<List<Double>> rds = rawExamplesFromDataSet(ds, numerizer);
108                    // normalize raw dataset
109                    nds = normalize(rds);
110            }
111    
112            private List<List<Double>> normalize(List<List<Double>> rds) {
113                    int rawDataLength = rds.get(0).size();
114                    List<List<Double>> nds = new ArrayList<List<Double>>();
115    
116                    means = new ArrayList<Double>();
117                    stdevs = new ArrayList<Double>();
118    
119                    List<List<Double>> normalizedColumns = new ArrayList<List<Double>>();
120                    // clculate means for each coponent of example data
121                    for (int i = 0; i < rawDataLength; i++) {
122                            List<Double> columnValues = new ArrayList<Double>();
123                            for (List<Double> rawDatum : rds) {
124                                    columnValues.add(rawDatum.get(i));
125                            }
126                            double mean = Util.calculateMean(columnValues);
127                            means.add(mean);
128    
129                            double stdev = Util.calculateStDev(columnValues, mean);
130                            stdevs.add(stdev);
131    
132                            normalizedColumns.add(Util.normalizeFromMeanAndStdev(columnValues,
133                                            mean, stdev));
134    
135                    }
136                    // re arrange data from columns
137                    // TODO Assert normalized columns have same size etc
138    
139                    int columnLength = normalizedColumns.get(0).size();
140                    int numberOfColumns = normalizedColumns.size();
141                    for (int i = 0; i < columnLength; i++) {
142                            List<Double> lst = new ArrayList<Double>();
143                            for (int j = 0; j < numberOfColumns; j++) {
144                                    lst.add(normalizedColumns.get(j).get(i));
145                            }
146                            nds.add(lst);
147                    }
148                    return nds;
149            }
150    
151            private List<Double> exampleFromString(String line, String separator) {
152                    // assumes all values for inout and target are doubles
153                    List<Double> rexample = new ArrayList<Double>();
154                    List<String> attributeValues = Arrays.asList(line.split(separator));
155                    for (String valString : attributeValues) {
156                            rexample.add(Double.parseDouble(valString));
157                    }
158                    return rexample;
159            }
160    
161            private List<List<Double>> rawExamplesFromDataSet(DataSet ds,
162                            Numerizer numerizer) {
163                    // assumes all values for inout and target are doubles
164                    List<List<Double>> rds = new ArrayList<List<Double>>();
165                    for (int i = 0; i < ds.size(); i++) {
166                            List<Double> rexample = new ArrayList<Double>();
167                            Example e = ds.getExample(i);
168                            Pair<List<Double>, List<Double>> p = numerizer.numerize(e);
169                            List<Double> attributes = p.getFirst();
170                            for (Double d : attributes) {
171                                    rexample.add(d);
172                            }
173                            List<Double> targets = p.getSecond();
174                            for (Double d : targets) {
175                                    rexample.add(d);
176                            }
177                            rds.add(rexample);
178                    }
179                    return rds;
180            }
181    
182            /*
183             * Gets (and removes) a random example from the 'presentlyProcessed'
184             */
185            public NNExample getExampleAtRandom() {
186    
187                    int i = Util.randomNumberBetween(0, (presentlyProcessed.size() - 1));
188                    return presentlyProcessed.remove(i);
189            }
190    
191            /*
192             * Gets (and removes) a random example from the 'presentlyProcessed'
193             */
194            public NNExample getExample(int index) {
195    
196                    return presentlyProcessed.remove(index);
197            }
198    
199            /*
200             * check if any more examples remain to be processed
201             */
202            public boolean hasMoreExamples() {
203                    return presentlyProcessed.size() > 0;
204            }
205    
206            /*
207             * check how many examples remain to be processed
208             */
209            public int howManyExamplesLeft() {
210                    return presentlyProcessed.size();
211            }
212    
213            /*
214             * refreshes the presentlyProcessed dataset so it can be used for a new
215             * epoch of training.
216             */
217            public void refreshDataset() {
218                    presentlyProcessed = new ArrayList<NNExample>();
219                    for (NNExample e : dataset) {
220                            presentlyProcessed.add(e.copyExample());
221                    }
222            }
223    
224            /*
225             * method called by clients to set up data set and make it ready for
226             * processing
227             */
228            public void createExamplesFromFile(String filename) throws Exception {
229                    createNormalizedDataFromFile(filename);
230                    setTargetColumns();
231                    createExamples();
232    
233            }
234    
235            /*
236             * method called by clients to set up data set and make it ready for
237             * processing
238             */
239            public void createExamplesFromDataSet(DataSet ds, Numerizer numerizer)
240                            throws Exception {
241                    createNormalizedDataFromDataSet(ds, numerizer);
242                    setTargetColumns();
243                    createExamples();
244    
245            }
246    
247            public List<List<Double>> getNormalizedData() {
248                    return nds;
249            }
250    
251            public List<Double> getMeans() {
252                    return means;
253            }
254    
255            public List<Double> getStdevs() {
256                    return stdevs;
257            }
258    
259    }