001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.math.random;
019    
020    import java.io.BufferedReader;
021    import java.io.File;
022    import java.io.FileReader;
023    import java.io.IOException;
024    import java.io.InputStreamReader;
025    import java.io.Serializable;
026    import java.net.URL;
027    import java.util.ArrayList;
028    import java.util.List;
029    
030    import org.apache.commons.math.MathRuntimeException;
031    import org.apache.commons.math.exception.util.LocalizedFormats;
032    import org.apache.commons.math.stat.descriptive.StatisticalSummary;
033    import org.apache.commons.math.stat.descriptive.SummaryStatistics;
034    import org.apache.commons.math.util.FastMath;
035    
036    /**
037     * Implements <code>EmpiricalDistribution</code> interface.  This implementation
038     * uses what amounts to the
039     * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
040     * Variable Kernel Method</a> with Gaussian smoothing:<p>
041     * <strong>Digesting the input file</strong>
042     * <ol><li>Pass the file once to compute min and max.</li>
043     * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
044     * <li>Pass the data file again, computing bin counts and univariate
045     *     statistics (mean, std dev.) for each of the bins </li>
046     * <li>Divide the interval (0,1) into subintervals associated with the bins,
047     *     with the length of a bin's subinterval proportional to its count.</li></ol>
048     * <strong>Generating random values from the distribution</strong><ol>
049     * <li>Generate a uniformly distributed value in (0,1) </li>
050     * <li>Select the subinterval to which the value belongs.
051     * <li>Generate a random Gaussian value with mean = mean of the associated
052     *     bin and std dev = std dev of associated bin.</li></ol></p><p>
053     *<strong>USAGE NOTES:</strong><ul>
054     *<li>The <code>binCount</code> is set by default to 1000.  A good rule of thumb
055     *    is to set the bin count to approximately the length of the input file divided
056     *    by 10. </li>
057     *<li>The input file <i>must</i> be a plain text file containing one valid numeric
058     *    entry per line.</li>
059     * </ul></p>
060     *
061     * @version $Revision: 1003886 $ $Date: 2010-10-02 23:04:44 +0200 (sam. 02 oct. 2010) $
062     */
063    public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution {
064    
065        /** Serializable version identifier */
066        private static final long serialVersionUID = 5729073523949762654L;
067    
068        /** List of SummaryStatistics objects characterizing the bins */
069        private final List<SummaryStatistics> binStats;
070    
071        /** Sample statistics */
072        private SummaryStatistics sampleStats = null;
073    
074        /** Max loaded value */
075        private double max = Double.NEGATIVE_INFINITY;
076    
077        /** Min loaded value */
078        private double min = Double.POSITIVE_INFINITY;
079    
080        /** Grid size */
081        private double delta = 0d;
082    
083        /** number of bins */
084        private final int binCount;
085    
086        /** is the distribution loaded? */
087        private boolean loaded = false;
088    
089        /** upper bounds of subintervals in (0,1) "belonging" to the bins */
090        private double[] upperBounds = null;
091    
092        /** RandomData instance to use in repeated calls to getNext() */
093        private final RandomData randomData = new RandomDataImpl();
094    
095        /**
096         * Creates a new EmpiricalDistribution with the default bin count.
097         */
098        public EmpiricalDistributionImpl() {
099            binCount = 1000;
100            binStats = new ArrayList<SummaryStatistics>();
101        }
102    
103        /**
104         * Creates a new EmpiricalDistribution  with the specified bin count.
105         *
106         * @param binCount number of bins
107         */
108        public EmpiricalDistributionImpl(int binCount) {
109            this.binCount = binCount;
110            binStats = new ArrayList<SummaryStatistics>();
111        }
112    
113         /**
114         * Computes the empirical distribution from the provided
115         * array of numbers.
116         *
117         * @param in the input data array
118         */
119        public void load(double[] in) {
120            DataAdapter da = new ArrayDataAdapter(in);
121            try {
122                da.computeStats();
123                fillBinStats(in);
124            } catch (IOException e) {
125                throw new MathRuntimeException(e);
126            }
127            loaded = true;
128    
129        }
130    
131        /**
132         * Computes the empirical distribution using data read from a URL.
133         * @param url  url of the input file
134         *
135         * @throws IOException if an IO error occurs
136         */
137        public void load(URL url) throws IOException {
138            BufferedReader in =
139                new BufferedReader(new InputStreamReader(url.openStream()));
140            try {
141                DataAdapter da = new StreamDataAdapter(in);
142                da.computeStats();
143                if (sampleStats.getN() == 0) {
144                    throw MathRuntimeException.createEOFException(LocalizedFormats.URL_CONTAINS_NO_DATA,
145                                                                  url);
146                }
147                in = new BufferedReader(new InputStreamReader(url.openStream()));
148                fillBinStats(in);
149                loaded = true;
150            } finally {
151               try {
152                   in.close();
153               } catch (IOException ex) {
154                   // ignore
155               }
156            }
157        }
158    
159        /**
160         * Computes the empirical distribution from the input file.
161         *
162         * @param file the input file
163         * @throws IOException if an IO error occurs
164         */
165        public void load(File file) throws IOException {
166            BufferedReader in = new BufferedReader(new FileReader(file));
167            try {
168                DataAdapter da = new StreamDataAdapter(in);
169                da.computeStats();
170                in = new BufferedReader(new FileReader(file));
171                fillBinStats(in);
172                loaded = true;
173            } finally {
174                try {
175                    in.close();
176                } catch (IOException ex) {
177                    // ignore
178                }
179            }
180        }
181    
182        /**
183         * Provides methods for computing <code>sampleStats</code> and
184         * <code>beanStats</code> abstracting the source of data.
185         */
186        private abstract class DataAdapter{
187    
188            /**
189             * Compute bin stats.
190             *
191             * @throws IOException  if an error occurs computing bin stats
192             */
193            public abstract void computeBinStats() throws IOException;
194    
195            /**
196             * Compute sample statistics.
197             *
198             * @throws IOException if an error occurs computing sample stats
199             */
200            public abstract void computeStats() throws IOException;
201    
202        }
203    
204        /**
205         * Factory of <code>DataAdapter</code> objects. For every supported source
206         * of data (array of doubles, file, etc.) an instance of the proper object
207         * is returned.
208         */
209        private class DataAdapterFactory{
210            /**
211             * Creates a DataAdapter from a data object
212             *
213             * @param in object providing access to the data
214             * @return DataAdapter instance
215             */
216            public DataAdapter getAdapter(Object in) {
217                if (in instanceof BufferedReader) {
218                    BufferedReader inputStream = (BufferedReader) in;
219                    return new StreamDataAdapter(inputStream);
220                } else if (in instanceof double[]) {
221                    double[] inputArray = (double[]) in;
222                    return new ArrayDataAdapter(inputArray);
223                } else {
224                    throw MathRuntimeException.createIllegalArgumentException(
225                          LocalizedFormats.INPUT_DATA_FROM_UNSUPPORTED_DATASOURCE,
226                          in.getClass().getName(),
227                          BufferedReader.class.getName(), double[].class.getName());
228                }
229            }
230        }
231        /**
232         * <code>DataAdapter</code> for data provided through some input stream
233         */
234        private class StreamDataAdapter extends DataAdapter{
235    
236            /** Input stream providing access to the data */
237            private BufferedReader inputStream;
238    
239            /**
240             * Create a StreamDataAdapter from a BufferedReader
241             *
242             * @param in BufferedReader input stream
243             */
244            public StreamDataAdapter(BufferedReader in){
245                super();
246                inputStream = in;
247            }
248    
249            /** {@inheritDoc} */
250            @Override
251            public void computeBinStats() throws IOException {
252                String str = null;
253                double val = 0.0d;
254                while ((str = inputStream.readLine()) != null) {
255                    val = Double.parseDouble(str);
256                    SummaryStatistics stats = binStats.get(findBin(val));
257                    stats.addValue(val);
258                }
259    
260                inputStream.close();
261                inputStream = null;
262            }
263    
264            /** {@inheritDoc} */
265            @Override
266            public void computeStats() throws IOException {
267                String str = null;
268                double val = 0.0;
269                sampleStats = new SummaryStatistics();
270                while ((str = inputStream.readLine()) != null) {
271                    val = Double.valueOf(str).doubleValue();
272                    sampleStats.addValue(val);
273                }
274                inputStream.close();
275                inputStream = null;
276            }
277        }
278    
279        /**
280         * <code>DataAdapter</code> for data provided as array of doubles.
281         */
282        private class ArrayDataAdapter extends DataAdapter {
283    
284            /** Array of input  data values */
285            private double[] inputArray;
286    
287            /**
288             * Construct an ArrayDataAdapter from a double[] array
289             *
290             * @param in double[] array holding the data
291             */
292            public ArrayDataAdapter(double[] in){
293                super();
294                inputArray = in;
295            }
296    
297            /** {@inheritDoc} */
298            @Override
299            public void computeStats() throws IOException {
300                sampleStats = new SummaryStatistics();
301                for (int i = 0; i < inputArray.length; i++) {
302                    sampleStats.addValue(inputArray[i]);
303                }
304            }
305    
306            /** {@inheritDoc} */
307            @Override
308            public void computeBinStats() throws IOException {
309                for (int i = 0; i < inputArray.length; i++) {
310                    SummaryStatistics stats =
311                        binStats.get(findBin(inputArray[i]));
312                    stats.addValue(inputArray[i]);
313                }
314            }
315        }
316    
317        /**
318         * Fills binStats array (second pass through data file).
319         *
320         * @param in object providing access to the data
321         * @throws IOException  if an IO error occurs
322         */
323        private void fillBinStats(Object in) throws IOException {
324            // Set up grid
325            min = sampleStats.getMin();
326            max = sampleStats.getMax();
327            delta = (max - min)/(Double.valueOf(binCount)).doubleValue();
328    
329            // Initialize binStats ArrayList
330            if (!binStats.isEmpty()) {
331                binStats.clear();
332            }
333            for (int i = 0; i < binCount; i++) {
334                SummaryStatistics stats = new SummaryStatistics();
335                binStats.add(i,stats);
336            }
337    
338            // Filling data in binStats Array
339            DataAdapterFactory aFactory = new DataAdapterFactory();
340            DataAdapter da = aFactory.getAdapter(in);
341            da.computeBinStats();
342    
343            // Assign upperBounds based on bin counts
344            upperBounds = new double[binCount];
345            upperBounds[0] =
346            ((double) binStats.get(0).getN()) / (double) sampleStats.getN();
347            for (int i = 1; i < binCount-1; i++) {
348                upperBounds[i] = upperBounds[i-1] +
349                ((double) binStats.get(i).getN()) / (double) sampleStats.getN();
350            }
351            upperBounds[binCount-1] = 1.0d;
352        }
353    
354        /**
355         * Returns the index of the bin to which the given value belongs
356         *
357         * @param value  the value whose bin we are trying to find
358         * @return the index of the bin containing the value
359         */
360        private int findBin(double value) {
361            return FastMath.min(
362                    FastMath.max((int) FastMath.ceil((value- min) / delta) - 1, 0),
363                    binCount - 1);
364            }
365    
366        /**
367         * Generates a random value from this distribution.
368         *
369         * @return the random value.
370         * @throws IllegalStateException if the distribution has not been loaded
371         */
372        public double getNextValue() throws IllegalStateException {
373    
374            if (!loaded) {
375                throw MathRuntimeException.createIllegalStateException(LocalizedFormats.DISTRIBUTION_NOT_LOADED);
376            }
377    
378            // Start with a uniformly distributed random number in (0,1)
379            double x = FastMath.random();
380    
381            // Use this to select the bin and generate a Gaussian within the bin
382            for (int i = 0; i < binCount; i++) {
383               if (x <= upperBounds[i]) {
384                   SummaryStatistics stats = binStats.get(i);
385                   if (stats.getN() > 0) {
386                       if (stats.getStandardDeviation() > 0) {  // more than one obs
387                            return randomData.nextGaussian
388                                (stats.getMean(),stats.getStandardDeviation());
389                       } else {
390                           return stats.getMean(); // only one obs in bin
391                       }
392                   }
393               }
394            }
395            throw new MathRuntimeException(LocalizedFormats.NO_BIN_SELECTED);
396        }
397    
398        /**
399         * Returns a {@link StatisticalSummary} describing this distribution.
400         * <strong>Preconditions:</strong><ul>
401         * <li>the distribution must be loaded before invoking this method</li></ul>
402         *
403         * @return the sample statistics
404         * @throws IllegalStateException if the distribution has not been loaded
405         */
406        public StatisticalSummary getSampleStats() {
407            return sampleStats;
408        }
409    
410        /**
411         * Returns the number of bins.
412         *
413         * @return the number of bins.
414         */
415        public int getBinCount() {
416            return binCount;
417        }
418    
419        /**
420         * Returns a List of {@link SummaryStatistics} instances containing
421         * statistics describing the values in each of the bins.  The list is
422         * indexed on the bin number.
423         *
424         * @return List of bin statistics.
425         */
426        public List<SummaryStatistics> getBinStats() {
427            return binStats;
428        }
429    
430        /**
431         * <p>Returns a fresh copy of the array of upper bounds for the bins.
432         * Bins are: <br/>
433         * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
434         *  (upperBounds[binCount-2], upperBounds[binCount-1] = max].</p>
435         *
436         * <p>Note: In versions 1.0-2.0 of commons-math, this method
437         * incorrectly returned the array of probability generator upper
438         * bounds now returned by {@link #getGeneratorUpperBounds()}.</p>
439         *
440         * @return array of bin upper bounds
441         * @since 2.1
442         */
443        public double[] getUpperBounds() {
444            double[] binUpperBounds = new double[binCount];
445            binUpperBounds[0] = min + delta;
446            for (int i = 1; i < binCount - 1; i++) {
447                binUpperBounds[i] = binUpperBounds[i-1] + delta;
448            }
449            binUpperBounds[binCount - 1] = max;
450            return binUpperBounds;
451        }
452    
453        /**
454         * <p>Returns a fresh copy of the array of upper bounds of the subintervals
455         * of [0,1] used in generating data from the empirical distribution.
456         * Subintervals correspond to bins with lengths proportional to bin counts.</p>
457         *
458         * <p>In versions 1.0-2.0 of commons-math, this array was (incorrectly) returned
459         * by {@link #getUpperBounds()}.</p>
460         *
461         * @since 2.1
462         * @return array of upper bounds of subintervals used in data generation
463         */
464        public double[] getGeneratorUpperBounds() {
465            int len = upperBounds.length;
466            double[] out = new double[len];
467            System.arraycopy(upperBounds, 0, out, 0, len);
468            return out;
469        }
470    
471        /**
472         * Property indicating whether or not the distribution has been loaded.
473         *
474         * @return true if the distribution has been loaded
475         */
476        public boolean isLoaded() {
477            return loaded;
478        }
479    }