001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.math.stat.descriptive;
018
019 import java.io.Serializable;
020 import java.lang.reflect.InvocationTargetException;
021 import java.util.Arrays;
022
023 import org.apache.commons.math.MathRuntimeException;
024 import org.apache.commons.math.exception.util.LocalizedFormats;
025 import org.apache.commons.math.stat.descriptive.moment.GeometricMean;
026 import org.apache.commons.math.stat.descriptive.moment.Kurtosis;
027 import org.apache.commons.math.stat.descriptive.moment.Mean;
028 import org.apache.commons.math.stat.descriptive.moment.Skewness;
029 import org.apache.commons.math.stat.descriptive.moment.Variance;
030 import org.apache.commons.math.stat.descriptive.rank.Max;
031 import org.apache.commons.math.stat.descriptive.rank.Min;
032 import org.apache.commons.math.stat.descriptive.rank.Percentile;
033 import org.apache.commons.math.stat.descriptive.summary.Sum;
034 import org.apache.commons.math.stat.descriptive.summary.SumOfSquares;
035 import org.apache.commons.math.util.ResizableDoubleArray;
036 import org.apache.commons.math.util.FastMath;
037
038
039 /**
040 * Maintains a dataset of values of a single variable and computes descriptive
041 * statistics based on stored data. The {@link #getWindowSize() windowSize}
042 * property sets a limit on the number of values that can be stored in the
043 * dataset. The default value, INFINITE_WINDOW, puts no limit on the size of
044 * the dataset. This value should be used with caution, as the backing store
045 * will grow without bound in this case. For very large datasets,
046 * {@link SummaryStatistics}, which does not store the dataset, should be used
047 * instead of this class. If <code>windowSize</code> is not INFINITE_WINDOW and
048 * more values are added than can be stored in the dataset, new values are
049 * added in a "rolling" manner, with new values replacing the "oldest" values
050 * in the dataset.
051 *
052 * <p>Note: this class is not threadsafe. Use
053 * {@link SynchronizedDescriptiveStatistics} if concurrent access from multiple
054 * threads is required.</p>
055 *
056 * @version $Revision: 1054186 $ $Date: 2011-01-01 03:28:46 +0100 (sam. 01 janv. 2011) $
057 */
058 public class DescriptiveStatistics implements StatisticalSummary, Serializable {
059
060 /**
061 * Represents an infinite window size. When the {@link #getWindowSize()}
062 * returns this value, there is no limit to the number of data values
063 * that can be stored in the dataset.
064 */
065 public static final int INFINITE_WINDOW = -1;
066
067 /** Serialization UID */
068 private static final long serialVersionUID = 4133067267405273064L;
069
070 /** Name of the setQuantile method. */
071 private static final String SET_QUANTILE_METHOD_NAME = "setQuantile";
072
073 /** hold the window size **/
074 protected int windowSize = INFINITE_WINDOW;
075
076 /**
077 * Stored data values
078 */
079 protected ResizableDoubleArray eDA = new ResizableDoubleArray();
080
081 /** Mean statistic implementation - can be reset by setter. */
082 private UnivariateStatistic meanImpl = new Mean();
083
084 /** Geometric mean statistic implementation - can be reset by setter. */
085 private UnivariateStatistic geometricMeanImpl = new GeometricMean();
086
087 /** Kurtosis statistic implementation - can be reset by setter. */
088 private UnivariateStatistic kurtosisImpl = new Kurtosis();
089
090 /** Maximum statistic implementation - can be reset by setter. */
091 private UnivariateStatistic maxImpl = new Max();
092
093 /** Minimum statistic implementation - can be reset by setter. */
094 private UnivariateStatistic minImpl = new Min();
095
096 /** Percentile statistic implementation - can be reset by setter. */
097 private UnivariateStatistic percentileImpl = new Percentile();
098
099 /** Skewness statistic implementation - can be reset by setter. */
100 private UnivariateStatistic skewnessImpl = new Skewness();
101
102 /** Variance statistic implementation - can be reset by setter. */
103 private UnivariateStatistic varianceImpl = new Variance();
104
105 /** Sum of squares statistic implementation - can be reset by setter. */
106 private UnivariateStatistic sumsqImpl = new SumOfSquares();
107
108 /** Sum statistic implementation - can be reset by setter. */
109 private UnivariateStatistic sumImpl = new Sum();
110
111 /**
112 * Construct a DescriptiveStatistics instance with an infinite window
113 */
114 public DescriptiveStatistics() {
115 }
116
117 /**
118 * Construct a DescriptiveStatistics instance with the specified window
119 *
120 * @param window the window size.
121 */
122 public DescriptiveStatistics(int window) {
123 setWindowSize(window);
124 }
125
126 /**
127 * Construct a DescriptiveStatistics instance with an infinite window
128 * and the initial data values in double[] initialDoubleArray.
129 * If initialDoubleArray is null, then this constructor corresponds to
130 * DescriptiveStatistics()
131 *
132 * @param initialDoubleArray the initial double[].
133 */
134 public DescriptiveStatistics(double[] initialDoubleArray) {
135 if (initialDoubleArray != null) {
136 eDA = new ResizableDoubleArray(initialDoubleArray);
137 }
138 }
139
140 /**
141 * Copy constructor. Construct a new DescriptiveStatistics instance that
142 * is a copy of original.
143 *
144 * @param original DescriptiveStatistics instance to copy
145 */
146 public DescriptiveStatistics(DescriptiveStatistics original) {
147 copy(original, this);
148 }
149
150 /**
151 * Adds the value to the dataset. If the dataset is at the maximum size
152 * (i.e., the number of stored elements equals the currently configured
153 * windowSize), the first (oldest) element in the dataset is discarded
154 * to make room for the new value.
155 *
156 * @param v the value to be added
157 */
158 public void addValue(double v) {
159 if (windowSize != INFINITE_WINDOW) {
160 if (getN() == windowSize) {
161 eDA.addElementRolling(v);
162 } else if (getN() < windowSize) {
163 eDA.addElement(v);
164 }
165 } else {
166 eDA.addElement(v);
167 }
168 }
169
170 /**
171 * Removes the most recent value from the dataset.
172 */
173 public void removeMostRecentValue() {
174 eDA.discardMostRecentElements(1);
175 }
176
177 /**
178 * Replaces the most recently stored value with the given value.
179 * There must be at least one element stored to call this method.
180 *
181 * @param v the value to replace the most recent stored value
182 * @return replaced value
183 */
184 public double replaceMostRecentValue(double v) {
185 return eDA.substituteMostRecentElement(v);
186 }
187
188 /**
189 * Returns the <a href="http://www.xycoon.com/arithmetic_mean.htm">
190 * arithmetic mean </a> of the available values
191 * @return The mean or Double.NaN if no values have been added.
192 */
193 public double getMean() {
194 return apply(meanImpl);
195 }
196
197 /**
198 * Returns the <a href="http://www.xycoon.com/geometric_mean.htm">
199 * geometric mean </a> of the available values
200 * @return The geometricMean, Double.NaN if no values have been added,
201 * or if the product of the available values is less than or equal to 0.
202 */
203 public double getGeometricMean() {
204 return apply(geometricMeanImpl);
205 }
206
207 /**
208 * Returns the variance of the available values.
209 * @return The variance, Double.NaN if no values have been added
210 * or 0.0 for a single value set.
211 */
212 public double getVariance() {
213 return apply(varianceImpl);
214 }
215
216 /**
217 * Returns the standard deviation of the available values.
218 * @return The standard deviation, Double.NaN if no values have been added
219 * or 0.0 for a single value set.
220 */
221 public double getStandardDeviation() {
222 double stdDev = Double.NaN;
223 if (getN() > 0) {
224 if (getN() > 1) {
225 stdDev = FastMath.sqrt(getVariance());
226 } else {
227 stdDev = 0.0;
228 }
229 }
230 return stdDev;
231 }
232
233 /**
234 * Returns the skewness of the available values. Skewness is a
235 * measure of the asymmetry of a given distribution.
236 * @return The skewness, Double.NaN if no values have been added
237 * or 0.0 for a value set <=2.
238 */
239 public double getSkewness() {
240 return apply(skewnessImpl);
241 }
242
243 /**
244 * Returns the Kurtosis of the available values. Kurtosis is a
245 * measure of the "peakedness" of a distribution
246 * @return The kurtosis, Double.NaN if no values have been added, or 0.0
247 * for a value set <=3.
248 */
249 public double getKurtosis() {
250 return apply(kurtosisImpl);
251 }
252
253 /**
254 * Returns the maximum of the available values
255 * @return The max or Double.NaN if no values have been added.
256 */
257 public double getMax() {
258 return apply(maxImpl);
259 }
260
261 /**
262 * Returns the minimum of the available values
263 * @return The min or Double.NaN if no values have been added.
264 */
265 public double getMin() {
266 return apply(minImpl);
267 }
268
269 /**
270 * Returns the number of available values
271 * @return The number of available values
272 */
273 public long getN() {
274 return eDA.getNumElements();
275 }
276
277 /**
278 * Returns the sum of the values that have been added to Univariate.
279 * @return The sum or Double.NaN if no values have been added
280 */
281 public double getSum() {
282 return apply(sumImpl);
283 }
284
285 /**
286 * Returns the sum of the squares of the available values.
287 * @return The sum of the squares or Double.NaN if no
288 * values have been added.
289 */
290 public double getSumsq() {
291 return apply(sumsqImpl);
292 }
293
294 /**
295 * Resets all statistics and storage
296 */
297 public void clear() {
298 eDA.clear();
299 }
300
301
302 /**
303 * Returns the maximum number of values that can be stored in the
304 * dataset, or INFINITE_WINDOW (-1) if there is no limit.
305 *
306 * @return The current window size or -1 if its Infinite.
307 */
308 public int getWindowSize() {
309 return windowSize;
310 }
311
312 /**
313 * WindowSize controls the number of values which contribute
314 * to the reported statistics. For example, if
315 * windowSize is set to 3 and the values {1,2,3,4,5}
316 * have been added <strong> in that order</strong>
317 * then the <i>available values</i> are {3,4,5} and all
318 * reported statistics will be based on these values
319 * @param windowSize sets the size of the window.
320 */
321 public void setWindowSize(int windowSize) {
322 if (windowSize < 1) {
323 if (windowSize != INFINITE_WINDOW) {
324 throw MathRuntimeException.createIllegalArgumentException(
325 LocalizedFormats.NOT_POSITIVE_WINDOW_SIZE, windowSize);
326 }
327 }
328
329 this.windowSize = windowSize;
330
331 // We need to check to see if we need to discard elements
332 // from the front of the array. If the windowSize is less than
333 // the current number of elements.
334 if (windowSize != INFINITE_WINDOW && windowSize < eDA.getNumElements()) {
335 eDA.discardFrontElements(eDA.getNumElements() - windowSize);
336 }
337 }
338
339 /**
340 * Returns the current set of values in an array of double primitives.
341 * The order of addition is preserved. The returned array is a fresh
342 * copy of the underlying data -- i.e., it is not a reference to the
343 * stored data.
344 *
345 * @return returns the current set of numbers in the order in which they
346 * were added to this set
347 */
348 public double[] getValues() {
349 return eDA.getElements();
350 }
351
352 /**
353 * Returns the current set of values in an array of double primitives,
354 * sorted in ascending order. The returned array is a fresh
355 * copy of the underlying data -- i.e., it is not a reference to the
356 * stored data.
357 * @return returns the current set of
358 * numbers sorted in ascending order
359 */
360 public double[] getSortedValues() {
361 double[] sort = getValues();
362 Arrays.sort(sort);
363 return sort;
364 }
365
366 /**
367 * Returns the element at the specified index
368 * @param index The Index of the element
369 * @return return the element at the specified index
370 */
371 public double getElement(int index) {
372 return eDA.getElement(index);
373 }
374
375 /**
376 * Returns an estimate for the pth percentile of the stored values.
377 * <p>
378 * The implementation provided here follows the first estimation procedure presented
379 * <a href="http://www.itl.nist.gov/div898/handbook/prc/section2/prc252.htm">here.</a>
380 * </p><p>
381 * <strong>Preconditions</strong>:<ul>
382 * <li><code>0 < p ≤ 100</code> (otherwise an
383 * <code>IllegalArgumentException</code> is thrown)</li>
384 * <li>at least one value must be stored (returns <code>Double.NaN
385 * </code> otherwise)</li>
386 * </ul></p>
387 *
388 * @param p the requested percentile (scaled from 0 - 100)
389 * @return An estimate for the pth percentile of the stored data
390 * @throws IllegalStateException if percentile implementation has been
391 * overridden and the supplied implementation does not support setQuantile
392 * values
393 */
394 public double getPercentile(double p) {
395 if (percentileImpl instanceof Percentile) {
396 ((Percentile) percentileImpl).setQuantile(p);
397 } else {
398 try {
399 percentileImpl.getClass().getMethod(SET_QUANTILE_METHOD_NAME,
400 new Class[] {Double.TYPE}).invoke(percentileImpl,
401 new Object[] {Double.valueOf(p)});
402 } catch (NoSuchMethodException e1) { // Setter guard should prevent
403 throw MathRuntimeException.createIllegalArgumentException(
404 LocalizedFormats.PERCENTILE_IMPLEMENTATION_UNSUPPORTED_METHOD,
405 percentileImpl.getClass().getName(), SET_QUANTILE_METHOD_NAME);
406 } catch (IllegalAccessException e2) {
407 throw MathRuntimeException.createIllegalArgumentException(
408 LocalizedFormats.PERCENTILE_IMPLEMENTATION_CANNOT_ACCESS_METHOD,
409 SET_QUANTILE_METHOD_NAME, percentileImpl.getClass().getName());
410 } catch (InvocationTargetException e3) {
411 throw MathRuntimeException.createIllegalArgumentException(e3.getCause());
412 }
413 }
414 return apply(percentileImpl);
415 }
416
417 /**
418 * Generates a text report displaying univariate statistics from values
419 * that have been added. Each statistic is displayed on a separate
420 * line.
421 *
422 * @return String with line feeds displaying statistics
423 */
424 @Override
425 public String toString() {
426 StringBuilder outBuffer = new StringBuilder();
427 String endl = "\n";
428 outBuffer.append("DescriptiveStatistics:").append(endl);
429 outBuffer.append("n: ").append(getN()).append(endl);
430 outBuffer.append("min: ").append(getMin()).append(endl);
431 outBuffer.append("max: ").append(getMax()).append(endl);
432 outBuffer.append("mean: ").append(getMean()).append(endl);
433 outBuffer.append("std dev: ").append(getStandardDeviation())
434 .append(endl);
435 outBuffer.append("median: ").append(getPercentile(50)).append(endl);
436 outBuffer.append("skewness: ").append(getSkewness()).append(endl);
437 outBuffer.append("kurtosis: ").append(getKurtosis()).append(endl);
438 return outBuffer.toString();
439 }
440
441 /**
442 * Apply the given statistic to the data associated with this set of statistics.
443 * @param stat the statistic to apply
444 * @return the computed value of the statistic.
445 */
446 public double apply(UnivariateStatistic stat) {
447 return stat.evaluate(eDA.getInternalValues(), eDA.start(), eDA.getNumElements());
448 }
449
450 // Implementation getters and setter
451
452 /**
453 * Returns the currently configured mean implementation.
454 *
455 * @return the UnivariateStatistic implementing the mean
456 * @since 1.2
457 */
458 public synchronized UnivariateStatistic getMeanImpl() {
459 return meanImpl;
460 }
461
462 /**
463 * <p>Sets the implementation for the mean.</p>
464 *
465 * @param meanImpl the UnivariateStatistic instance to use
466 * for computing the mean
467 * @since 1.2
468 */
469 public synchronized void setMeanImpl(UnivariateStatistic meanImpl) {
470 this.meanImpl = meanImpl;
471 }
472
473 /**
474 * Returns the currently configured geometric mean implementation.
475 *
476 * @return the UnivariateStatistic implementing the geometric mean
477 * @since 1.2
478 */
479 public synchronized UnivariateStatistic getGeometricMeanImpl() {
480 return geometricMeanImpl;
481 }
482
483 /**
484 * <p>Sets the implementation for the gemoetric mean.</p>
485 *
486 * @param geometricMeanImpl the UnivariateStatistic instance to use
487 * for computing the geometric mean
488 * @since 1.2
489 */
490 public synchronized void setGeometricMeanImpl(
491 UnivariateStatistic geometricMeanImpl) {
492 this.geometricMeanImpl = geometricMeanImpl;
493 }
494
495 /**
496 * Returns the currently configured kurtosis implementation.
497 *
498 * @return the UnivariateStatistic implementing the kurtosis
499 * @since 1.2
500 */
501 public synchronized UnivariateStatistic getKurtosisImpl() {
502 return kurtosisImpl;
503 }
504
505 /**
506 * <p>Sets the implementation for the kurtosis.</p>
507 *
508 * @param kurtosisImpl the UnivariateStatistic instance to use
509 * for computing the kurtosis
510 * @since 1.2
511 */
512 public synchronized void setKurtosisImpl(UnivariateStatistic kurtosisImpl) {
513 this.kurtosisImpl = kurtosisImpl;
514 }
515
516 /**
517 * Returns the currently configured maximum implementation.
518 *
519 * @return the UnivariateStatistic implementing the maximum
520 * @since 1.2
521 */
522 public synchronized UnivariateStatistic getMaxImpl() {
523 return maxImpl;
524 }
525
526 /**
527 * <p>Sets the implementation for the maximum.</p>
528 *
529 * @param maxImpl the UnivariateStatistic instance to use
530 * for computing the maximum
531 * @since 1.2
532 */
533 public synchronized void setMaxImpl(UnivariateStatistic maxImpl) {
534 this.maxImpl = maxImpl;
535 }
536
537 /**
538 * Returns the currently configured minimum implementation.
539 *
540 * @return the UnivariateStatistic implementing the minimum
541 * @since 1.2
542 */
543 public synchronized UnivariateStatistic getMinImpl() {
544 return minImpl;
545 }
546
547 /**
548 * <p>Sets the implementation for the minimum.</p>
549 *
550 * @param minImpl the UnivariateStatistic instance to use
551 * for computing the minimum
552 * @since 1.2
553 */
554 public synchronized void setMinImpl(UnivariateStatistic minImpl) {
555 this.minImpl = minImpl;
556 }
557
558 /**
559 * Returns the currently configured percentile implementation.
560 *
561 * @return the UnivariateStatistic implementing the percentile
562 * @since 1.2
563 */
564 public synchronized UnivariateStatistic getPercentileImpl() {
565 return percentileImpl;
566 }
567
568 /**
569 * Sets the implementation to be used by {@link #getPercentile(double)}.
570 * The supplied <code>UnivariateStatistic</code> must provide a
571 * <code>setQuantile(double)</code> method; otherwise
572 * <code>IllegalArgumentException</code> is thrown.
573 *
574 * @param percentileImpl the percentileImpl to set
575 * @throws IllegalArgumentException if the supplied implementation does not
576 * provide a <code>setQuantile</code> method
577 * @since 1.2
578 */
579 public synchronized void setPercentileImpl(
580 UnivariateStatistic percentileImpl) {
581 try {
582 percentileImpl.getClass().getMethod(SET_QUANTILE_METHOD_NAME,
583 new Class[] {Double.TYPE}).invoke(percentileImpl,
584 new Object[] {Double.valueOf(50.0d)});
585 } catch (NoSuchMethodException e1) {
586 throw MathRuntimeException.createIllegalArgumentException(
587 LocalizedFormats.PERCENTILE_IMPLEMENTATION_UNSUPPORTED_METHOD,
588 percentileImpl.getClass().getName(), SET_QUANTILE_METHOD_NAME);
589 } catch (IllegalAccessException e2) {
590 throw MathRuntimeException.createIllegalArgumentException(
591 LocalizedFormats.PERCENTILE_IMPLEMENTATION_CANNOT_ACCESS_METHOD,
592 SET_QUANTILE_METHOD_NAME, percentileImpl.getClass().getName());
593 } catch (InvocationTargetException e3) {
594 throw MathRuntimeException.createIllegalArgumentException(e3.getCause());
595 }
596 this.percentileImpl = percentileImpl;
597 }
598
599 /**
600 * Returns the currently configured skewness implementation.
601 *
602 * @return the UnivariateStatistic implementing the skewness
603 * @since 1.2
604 */
605 public synchronized UnivariateStatistic getSkewnessImpl() {
606 return skewnessImpl;
607 }
608
609 /**
610 * <p>Sets the implementation for the skewness.</p>
611 *
612 * @param skewnessImpl the UnivariateStatistic instance to use
613 * for computing the skewness
614 * @since 1.2
615 */
616 public synchronized void setSkewnessImpl(
617 UnivariateStatistic skewnessImpl) {
618 this.skewnessImpl = skewnessImpl;
619 }
620
621 /**
622 * Returns the currently configured variance implementation.
623 *
624 * @return the UnivariateStatistic implementing the variance
625 * @since 1.2
626 */
627 public synchronized UnivariateStatistic getVarianceImpl() {
628 return varianceImpl;
629 }
630
631 /**
632 * <p>Sets the implementation for the variance.</p>
633 *
634 * @param varianceImpl the UnivariateStatistic instance to use
635 * for computing the variance
636 * @since 1.2
637 */
638 public synchronized void setVarianceImpl(
639 UnivariateStatistic varianceImpl) {
640 this.varianceImpl = varianceImpl;
641 }
642
643 /**
644 * Returns the currently configured sum of squares implementation.
645 *
646 * @return the UnivariateStatistic implementing the sum of squares
647 * @since 1.2
648 */
649 public synchronized UnivariateStatistic getSumsqImpl() {
650 return sumsqImpl;
651 }
652
653 /**
654 * <p>Sets the implementation for the sum of squares.</p>
655 *
656 * @param sumsqImpl the UnivariateStatistic instance to use
657 * for computing the sum of squares
658 * @since 1.2
659 */
660 public synchronized void setSumsqImpl(UnivariateStatistic sumsqImpl) {
661 this.sumsqImpl = sumsqImpl;
662 }
663
664 /**
665 * Returns the currently configured sum implementation.
666 *
667 * @return the UnivariateStatistic implementing the sum
668 * @since 1.2
669 */
670 public synchronized UnivariateStatistic getSumImpl() {
671 return sumImpl;
672 }
673
674 /**
675 * <p>Sets the implementation for the sum.</p>
676 *
677 * @param sumImpl the UnivariateStatistic instance to use
678 * for computing the sum
679 * @since 1.2
680 */
681 public synchronized void setSumImpl(UnivariateStatistic sumImpl) {
682 this.sumImpl = sumImpl;
683 }
684
685 /**
686 * Returns a copy of this DescriptiveStatistics instance with the same internal state.
687 *
688 * @return a copy of this
689 */
690 public DescriptiveStatistics copy() {
691 DescriptiveStatistics result = new DescriptiveStatistics();
692 copy(this, result);
693 return result;
694 }
695
696 /**
697 * Copies source to dest.
698 * <p>Neither source nor dest can be null.</p>
699 *
700 * @param source DescriptiveStatistics to copy
701 * @param dest DescriptiveStatistics to copy to
702 * @throws NullPointerException if either source or dest is null
703 */
704 public static void copy(DescriptiveStatistics source, DescriptiveStatistics dest) {
705 // Copy data and window size
706 dest.eDA = source.eDA.copy();
707 dest.windowSize = source.windowSize;
708
709 // Copy implementations
710 dest.maxImpl = source.maxImpl.copy();
711 dest.meanImpl = source.meanImpl.copy();
712 dest.minImpl = source.minImpl.copy();
713 dest.sumImpl = source.sumImpl.copy();
714 dest.varianceImpl = source.varianceImpl.copy();
715 dest.sumsqImpl = source.sumsqImpl.copy();
716 dest.geometricMeanImpl = source.geometricMeanImpl.copy();
717 dest.kurtosisImpl = source.kurtosisImpl;
718 dest.skewnessImpl = source.skewnessImpl;
719 dest.percentileImpl = source.percentileImpl;
720 }
721 }