001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.math.stat.descriptive;
018
019 import java.io.Serializable;
020 import java.util.Arrays;
021
022 import org.apache.commons.math.DimensionMismatchException;
023 import org.apache.commons.math.MathRuntimeException;
024 import org.apache.commons.math.exception.util.LocalizedFormats;
025 import org.apache.commons.math.linear.RealMatrix;
026 import org.apache.commons.math.stat.descriptive.moment.GeometricMean;
027 import org.apache.commons.math.stat.descriptive.moment.Mean;
028 import org.apache.commons.math.stat.descriptive.moment.VectorialCovariance;
029 import org.apache.commons.math.stat.descriptive.rank.Max;
030 import org.apache.commons.math.stat.descriptive.rank.Min;
031 import org.apache.commons.math.stat.descriptive.summary.Sum;
032 import org.apache.commons.math.stat.descriptive.summary.SumOfLogs;
033 import org.apache.commons.math.stat.descriptive.summary.SumOfSquares;
034 import org.apache.commons.math.util.MathUtils;
035 import org.apache.commons.math.util.FastMath;
036
037 /**
038 * <p>Computes summary statistics for a stream of n-tuples added using the
039 * {@link #addValue(double[]) addValue} method. The data values are not stored
040 * in memory, so this class can be used to compute statistics for very large
041 * n-tuple streams.</p>
042 *
043 * <p>The {@link StorelessUnivariateStatistic} instances used to maintain
044 * summary state and compute statistics are configurable via setters.
045 * For example, the default implementation for the mean can be overridden by
046 * calling {@link #setMeanImpl(StorelessUnivariateStatistic[])}. Actual
047 * parameters to these methods must implement the
048 * {@link StorelessUnivariateStatistic} interface and configuration must be
049 * completed before <code>addValue</code> is called. No configuration is
050 * necessary to use the default, commons-math provided implementations.</p>
051 *
052 * <p>To compute statistics for a stream of n-tuples, construct a
053 * MultivariateStatistics instance with dimension n and then use
054 * {@link #addValue(double[])} to add n-tuples. The <code>getXxx</code>
055 * methods where Xxx is a statistic return an array of <code>double</code>
056 * values, where for <code>i = 0,...,n-1</code> the i<sup>th</sup> array element is the
057 * value of the given statistic for data range consisting of the i<sup>th</sup> element of
058 * each of the input n-tuples. For example, if <code>addValue</code> is called
059 * with actual parameters {0, 1, 2}, then {3, 4, 5} and finally {6, 7, 8},
060 * <code>getSum</code> will return a three-element array with values
061 * {0+3+6, 1+4+7, 2+5+8}</p>
062 *
063 * <p>Note: This class is not thread-safe. Use
064 * {@link SynchronizedMultivariateSummaryStatistics} if concurrent access from multiple
065 * threads is required.</p>
066 *
067 * @since 1.2
068 * @version $Revision: 1042376 $ $Date: 2010-12-05 16:54:55 +0100 (dim. 05 d??c. 2010) $
069 */
070 public class MultivariateSummaryStatistics
071 implements StatisticalMultivariateSummary, Serializable {
072
073 /** Serialization UID */
074 private static final long serialVersionUID = 2271900808994826718L;
075
076 /** Dimension of the data. */
077 private int k;
078
079 /** Count of values that have been added */
080 private long n = 0;
081
082 /** Sum statistic implementation - can be reset by setter. */
083 private StorelessUnivariateStatistic[] sumImpl;
084
085 /** Sum of squares statistic implementation - can be reset by setter. */
086 private StorelessUnivariateStatistic[] sumSqImpl;
087
088 /** Minimum statistic implementation - can be reset by setter. */
089 private StorelessUnivariateStatistic[] minImpl;
090
091 /** Maximum statistic implementation - can be reset by setter. */
092 private StorelessUnivariateStatistic[] maxImpl;
093
094 /** Sum of log statistic implementation - can be reset by setter. */
095 private StorelessUnivariateStatistic[] sumLogImpl;
096
097 /** Geometric mean statistic implementation - can be reset by setter. */
098 private StorelessUnivariateStatistic[] geoMeanImpl;
099
100 /** Mean statistic implementation - can be reset by setter. */
101 private StorelessUnivariateStatistic[] meanImpl;
102
103 /** Covariance statistic implementation - cannot be reset. */
104 private VectorialCovariance covarianceImpl;
105
106 /**
107 * Construct a MultivariateSummaryStatistics instance
108 * @param k dimension of the data
109 * @param isCovarianceBiasCorrected if true, the unbiased sample
110 * covariance is computed, otherwise the biased population covariance
111 * is computed
112 */
113 public MultivariateSummaryStatistics(int k, boolean isCovarianceBiasCorrected) {
114 this.k = k;
115
116 sumImpl = new StorelessUnivariateStatistic[k];
117 sumSqImpl = new StorelessUnivariateStatistic[k];
118 minImpl = new StorelessUnivariateStatistic[k];
119 maxImpl = new StorelessUnivariateStatistic[k];
120 sumLogImpl = new StorelessUnivariateStatistic[k];
121 geoMeanImpl = new StorelessUnivariateStatistic[k];
122 meanImpl = new StorelessUnivariateStatistic[k];
123
124 for (int i = 0; i < k; ++i) {
125 sumImpl[i] = new Sum();
126 sumSqImpl[i] = new SumOfSquares();
127 minImpl[i] = new Min();
128 maxImpl[i] = new Max();
129 sumLogImpl[i] = new SumOfLogs();
130 geoMeanImpl[i] = new GeometricMean();
131 meanImpl[i] = new Mean();
132 }
133
134 covarianceImpl =
135 new VectorialCovariance(k, isCovarianceBiasCorrected);
136
137 }
138
139 /**
140 * Add an n-tuple to the data
141 *
142 * @param value the n-tuple to add
143 * @throws DimensionMismatchException if the length of the array
144 * does not match the one used at construction
145 */
146 public void addValue(double[] value)
147 throws DimensionMismatchException {
148 checkDimension(value.length);
149 for (int i = 0; i < k; ++i) {
150 double v = value[i];
151 sumImpl[i].increment(v);
152 sumSqImpl[i].increment(v);
153 minImpl[i].increment(v);
154 maxImpl[i].increment(v);
155 sumLogImpl[i].increment(v);
156 geoMeanImpl[i].increment(v);
157 meanImpl[i].increment(v);
158 }
159 covarianceImpl.increment(value);
160 n++;
161 }
162
163 /**
164 * Returns the dimension of the data
165 * @return The dimension of the data
166 */
167 public int getDimension() {
168 return k;
169 }
170
171 /**
172 * Returns the number of available values
173 * @return The number of available values
174 */
175 public long getN() {
176 return n;
177 }
178
179 /**
180 * Returns an array of the results of a statistic.
181 * @param stats univariate statistic array
182 * @return results array
183 */
184 private double[] getResults(StorelessUnivariateStatistic[] stats) {
185 double[] results = new double[stats.length];
186 for (int i = 0; i < results.length; ++i) {
187 results[i] = stats[i].getResult();
188 }
189 return results;
190 }
191
192 /**
193 * Returns an array whose i<sup>th</sup> entry is the sum of the
194 * i<sup>th</sup> entries of the arrays that have been added using
195 * {@link #addValue(double[])}
196 *
197 * @return the array of component sums
198 */
199 public double[] getSum() {
200 return getResults(sumImpl);
201 }
202
203 /**
204 * Returns an array whose i<sup>th</sup> entry is the sum of squares of the
205 * i<sup>th</sup> entries of the arrays that have been added using
206 * {@link #addValue(double[])}
207 *
208 * @return the array of component sums of squares
209 */
210 public double[] getSumSq() {
211 return getResults(sumSqImpl);
212 }
213
214 /**
215 * Returns an array whose i<sup>th</sup> entry is the sum of logs of the
216 * i<sup>th</sup> entries of the arrays that have been added using
217 * {@link #addValue(double[])}
218 *
219 * @return the array of component log sums
220 */
221 public double[] getSumLog() {
222 return getResults(sumLogImpl);
223 }
224
225 /**
226 * Returns an array whose i<sup>th</sup> entry is the mean of the
227 * i<sup>th</sup> entries of the arrays that have been added using
228 * {@link #addValue(double[])}
229 *
230 * @return the array of component means
231 */
232 public double[] getMean() {
233 return getResults(meanImpl);
234 }
235
236 /**
237 * Returns an array whose i<sup>th</sup> entry is the standard deviation of the
238 * i<sup>th</sup> entries of the arrays that have been added using
239 * {@link #addValue(double[])}
240 *
241 * @return the array of component standard deviations
242 */
243 public double[] getStandardDeviation() {
244 double[] stdDev = new double[k];
245 if (getN() < 1) {
246 Arrays.fill(stdDev, Double.NaN);
247 } else if (getN() < 2) {
248 Arrays.fill(stdDev, 0.0);
249 } else {
250 RealMatrix matrix = covarianceImpl.getResult();
251 for (int i = 0; i < k; ++i) {
252 stdDev[i] = FastMath.sqrt(matrix.getEntry(i, i));
253 }
254 }
255 return stdDev;
256 }
257
258 /**
259 * Returns the covariance matrix of the values that have been added.
260 *
261 * @return the covariance matrix
262 */
263 public RealMatrix getCovariance() {
264 return covarianceImpl.getResult();
265 }
266
267 /**
268 * Returns an array whose i<sup>th</sup> entry is the maximum of the
269 * i<sup>th</sup> entries of the arrays that have been added using
270 * {@link #addValue(double[])}
271 *
272 * @return the array of component maxima
273 */
274 public double[] getMax() {
275 return getResults(maxImpl);
276 }
277
278 /**
279 * Returns an array whose i<sup>th</sup> entry is the minimum of the
280 * i<sup>th</sup> entries of the arrays that have been added using
281 * {@link #addValue(double[])}
282 *
283 * @return the array of component minima
284 */
285 public double[] getMin() {
286 return getResults(minImpl);
287 }
288
289 /**
290 * Returns an array whose i<sup>th</sup> entry is the geometric mean of the
291 * i<sup>th</sup> entries of the arrays that have been added using
292 * {@link #addValue(double[])}
293 *
294 * @return the array of component geometric means
295 */
296 public double[] getGeometricMean() {
297 return getResults(geoMeanImpl);
298 }
299
300 /**
301 * Generates a text report displaying
302 * summary statistics from values that
303 * have been added.
304 * @return String with line feeds displaying statistics
305 */
306 @Override
307 public String toString() {
308 final String separator = ", ";
309 final String suffix = System.getProperty("line.separator");
310 StringBuilder outBuffer = new StringBuilder();
311 outBuffer.append("MultivariateSummaryStatistics:" + suffix);
312 outBuffer.append("n: " + getN() + suffix);
313 append(outBuffer, getMin(), "min: ", separator, suffix);
314 append(outBuffer, getMax(), "max: ", separator, suffix);
315 append(outBuffer, getMean(), "mean: ", separator, suffix);
316 append(outBuffer, getGeometricMean(), "geometric mean: ", separator, suffix);
317 append(outBuffer, getSumSq(), "sum of squares: ", separator, suffix);
318 append(outBuffer, getSumLog(), "sum of logarithms: ", separator, suffix);
319 append(outBuffer, getStandardDeviation(), "standard deviation: ", separator, suffix);
320 outBuffer.append("covariance: " + getCovariance().toString() + suffix);
321 return outBuffer.toString();
322 }
323
324 /**
325 * Append a text representation of an array to a buffer.
326 * @param buffer buffer to fill
327 * @param data data array
328 * @param prefix text prefix
329 * @param separator elements separator
330 * @param suffix text suffix
331 */
332 private void append(StringBuilder buffer, double[] data,
333 String prefix, String separator, String suffix) {
334 buffer.append(prefix);
335 for (int i = 0; i < data.length; ++i) {
336 if (i > 0) {
337 buffer.append(separator);
338 }
339 buffer.append(data[i]);
340 }
341 buffer.append(suffix);
342 }
343
344 /**
345 * Resets all statistics and storage
346 */
347 public void clear() {
348 this.n = 0;
349 for (int i = 0; i < k; ++i) {
350 minImpl[i].clear();
351 maxImpl[i].clear();
352 sumImpl[i].clear();
353 sumLogImpl[i].clear();
354 sumSqImpl[i].clear();
355 geoMeanImpl[i].clear();
356 meanImpl[i].clear();
357 }
358 covarianceImpl.clear();
359 }
360
361 /**
362 * Returns true iff <code>object</code> is a <code>MultivariateSummaryStatistics</code>
363 * instance and all statistics have the same values as this.
364 * @param object the object to test equality against.
365 * @return true if object equals this
366 */
367 @Override
368 public boolean equals(Object object) {
369 if (object == this ) {
370 return true;
371 }
372 if (object instanceof MultivariateSummaryStatistics == false) {
373 return false;
374 }
375 MultivariateSummaryStatistics stat = (MultivariateSummaryStatistics) object;
376 return MathUtils.equalsIncludingNaN(stat.getGeometricMean(), getGeometricMean()) &&
377 MathUtils.equalsIncludingNaN(stat.getMax(), getMax()) &&
378 MathUtils.equalsIncludingNaN(stat.getMean(), getMean()) &&
379 MathUtils.equalsIncludingNaN(stat.getMin(), getMin()) &&
380 MathUtils.equalsIncludingNaN(stat.getN(), getN()) &&
381 MathUtils.equalsIncludingNaN(stat.getSum(), getSum()) &&
382 MathUtils.equalsIncludingNaN(stat.getSumSq(), getSumSq()) &&
383 MathUtils.equalsIncludingNaN(stat.getSumLog(), getSumLog()) &&
384 stat.getCovariance().equals( getCovariance());
385 }
386
387 /**
388 * Returns hash code based on values of statistics
389 *
390 * @return hash code
391 */
392 @Override
393 public int hashCode() {
394 int result = 31 + MathUtils.hash(getGeometricMean());
395 result = result * 31 + MathUtils.hash(getGeometricMean());
396 result = result * 31 + MathUtils.hash(getMax());
397 result = result * 31 + MathUtils.hash(getMean());
398 result = result * 31 + MathUtils.hash(getMin());
399 result = result * 31 + MathUtils.hash(getN());
400 result = result * 31 + MathUtils.hash(getSum());
401 result = result * 31 + MathUtils.hash(getSumSq());
402 result = result * 31 + MathUtils.hash(getSumLog());
403 result = result * 31 + getCovariance().hashCode();
404 return result;
405 }
406
407 // Getters and setters for statistics implementations
408 /**
409 * Sets statistics implementations.
410 * @param newImpl new implementations for statistics
411 * @param oldImpl old implementations for statistics
412 * @throws DimensionMismatchException if the array dimension
413 * does not match the one used at construction
414 * @throws IllegalStateException if data has already been added
415 * (i.e if n > 0)
416 */
417 private void setImpl(StorelessUnivariateStatistic[] newImpl,
418 StorelessUnivariateStatistic[] oldImpl)
419 throws DimensionMismatchException, IllegalStateException {
420 checkEmpty();
421 checkDimension(newImpl.length);
422 System.arraycopy(newImpl, 0, oldImpl, 0, newImpl.length);
423 }
424
425 /**
426 * Returns the currently configured Sum implementation
427 *
428 * @return the StorelessUnivariateStatistic implementing the sum
429 */
430 public StorelessUnivariateStatistic[] getSumImpl() {
431 return sumImpl.clone();
432 }
433
434 /**
435 * <p>Sets the implementation for the Sum.</p>
436 * <p>This method must be activated before any data has been added - i.e.,
437 * before {@link #addValue(double[]) addValue} has been used to add data;
438 * otherwise an IllegalStateException will be thrown.</p>
439 *
440 * @param sumImpl the StorelessUnivariateStatistic instance to use
441 * for computing the Sum
442 * @throws DimensionMismatchException if the array dimension
443 * does not match the one used at construction
444 * @throws IllegalStateException if data has already been added
445 * (i.e if n > 0)
446 */
447 public void setSumImpl(StorelessUnivariateStatistic[] sumImpl)
448 throws DimensionMismatchException {
449 setImpl(sumImpl, this.sumImpl);
450 }
451
452 /**
453 * Returns the currently configured sum of squares implementation
454 *
455 * @return the StorelessUnivariateStatistic implementing the sum of squares
456 */
457 public StorelessUnivariateStatistic[] getSumsqImpl() {
458 return sumSqImpl.clone();
459 }
460
461 /**
462 * <p>Sets the implementation for the sum of squares.</p>
463 * <p>This method must be activated before any data has been added - i.e.,
464 * before {@link #addValue(double[]) addValue} has been used to add data;
465 * otherwise an IllegalStateException will be thrown.</p>
466 *
467 * @param sumsqImpl the StorelessUnivariateStatistic instance to use
468 * for computing the sum of squares
469 * @throws DimensionMismatchException if the array dimension
470 * does not match the one used at construction
471 * @throws IllegalStateException if data has already been added
472 * (i.e if n > 0)
473 */
474 public void setSumsqImpl(StorelessUnivariateStatistic[] sumsqImpl)
475 throws DimensionMismatchException {
476 setImpl(sumsqImpl, this.sumSqImpl);
477 }
478
479 /**
480 * Returns the currently configured minimum implementation
481 *
482 * @return the StorelessUnivariateStatistic implementing the minimum
483 */
484 public StorelessUnivariateStatistic[] getMinImpl() {
485 return minImpl.clone();
486 }
487
488 /**
489 * <p>Sets the implementation for the minimum.</p>
490 * <p>This method must be activated before any data has been added - i.e.,
491 * before {@link #addValue(double[]) addValue} has been used to add data;
492 * otherwise an IllegalStateException will be thrown.</p>
493 *
494 * @param minImpl the StorelessUnivariateStatistic instance to use
495 * for computing the minimum
496 * @throws DimensionMismatchException if the array dimension
497 * does not match the one used at construction
498 * @throws IllegalStateException if data has already been added
499 * (i.e if n > 0)
500 */
501 public void setMinImpl(StorelessUnivariateStatistic[] minImpl)
502 throws DimensionMismatchException {
503 setImpl(minImpl, this.minImpl);
504 }
505
506 /**
507 * Returns the currently configured maximum implementation
508 *
509 * @return the StorelessUnivariateStatistic implementing the maximum
510 */
511 public StorelessUnivariateStatistic[] getMaxImpl() {
512 return maxImpl.clone();
513 }
514
515 /**
516 * <p>Sets the implementation for the maximum.</p>
517 * <p>This method must be activated before any data has been added - i.e.,
518 * before {@link #addValue(double[]) addValue} has been used to add data;
519 * otherwise an IllegalStateException will be thrown.</p>
520 *
521 * @param maxImpl the StorelessUnivariateStatistic instance to use
522 * for computing the maximum
523 * @throws DimensionMismatchException if the array dimension
524 * does not match the one used at construction
525 * @throws IllegalStateException if data has already been added
526 * (i.e if n > 0)
527 */
528 public void setMaxImpl(StorelessUnivariateStatistic[] maxImpl)
529 throws DimensionMismatchException {
530 setImpl(maxImpl, this.maxImpl);
531 }
532
533 /**
534 * Returns the currently configured sum of logs implementation
535 *
536 * @return the StorelessUnivariateStatistic implementing the log sum
537 */
538 public StorelessUnivariateStatistic[] getSumLogImpl() {
539 return sumLogImpl.clone();
540 }
541
542 /**
543 * <p>Sets the implementation for the sum of logs.</p>
544 * <p>This method must be activated before any data has been added - i.e.,
545 * before {@link #addValue(double[]) addValue} has been used to add data;
546 * otherwise an IllegalStateException will be thrown.</p>
547 *
548 * @param sumLogImpl the StorelessUnivariateStatistic instance to use
549 * for computing the log sum
550 * @throws DimensionMismatchException if the array dimension
551 * does not match the one used at construction
552 * @throws IllegalStateException if data has already been added
553 * (i.e if n > 0)
554 */
555 public void setSumLogImpl(StorelessUnivariateStatistic[] sumLogImpl)
556 throws DimensionMismatchException {
557 setImpl(sumLogImpl, this.sumLogImpl);
558 }
559
560 /**
561 * Returns the currently configured geometric mean implementation
562 *
563 * @return the StorelessUnivariateStatistic implementing the geometric mean
564 */
565 public StorelessUnivariateStatistic[] getGeoMeanImpl() {
566 return geoMeanImpl.clone();
567 }
568
569 /**
570 * <p>Sets the implementation for the geometric mean.</p>
571 * <p>This method must be activated before any data has been added - i.e.,
572 * before {@link #addValue(double[]) addValue} has been used to add data;
573 * otherwise an IllegalStateException will be thrown.</p>
574 *
575 * @param geoMeanImpl the StorelessUnivariateStatistic instance to use
576 * for computing the geometric mean
577 * @throws DimensionMismatchException if the array dimension
578 * does not match the one used at construction
579 * @throws IllegalStateException if data has already been added
580 * (i.e if n > 0)
581 */
582 public void setGeoMeanImpl(StorelessUnivariateStatistic[] geoMeanImpl)
583 throws DimensionMismatchException {
584 setImpl(geoMeanImpl, this.geoMeanImpl);
585 }
586
587 /**
588 * Returns the currently configured mean implementation
589 *
590 * @return the StorelessUnivariateStatistic implementing the mean
591 */
592 public StorelessUnivariateStatistic[] getMeanImpl() {
593 return meanImpl.clone();
594 }
595
596 /**
597 * <p>Sets the implementation for the mean.</p>
598 * <p>This method must be activated before any data has been added - i.e.,
599 * before {@link #addValue(double[]) addValue} has been used to add data;
600 * otherwise an IllegalStateException will be thrown.</p>
601 *
602 * @param meanImpl the StorelessUnivariateStatistic instance to use
603 * for computing the mean
604 * @throws DimensionMismatchException if the array dimension
605 * does not match the one used at construction
606 * @throws IllegalStateException if data has already been added
607 * (i.e if n > 0)
608 */
609 public void setMeanImpl(StorelessUnivariateStatistic[] meanImpl)
610 throws DimensionMismatchException {
611 setImpl(meanImpl, this.meanImpl);
612 }
613
614 /**
615 * Throws IllegalStateException if n > 0.
616 */
617 private void checkEmpty() {
618 if (n > 0) {
619 throw MathRuntimeException.createIllegalStateException(
620 LocalizedFormats.VALUES_ADDED_BEFORE_CONFIGURING_STATISTIC,
621 n);
622 }
623 }
624
625 /**
626 * Throws DimensionMismatchException if dimension != k.
627 * @param dimension dimension to check
628 * @throws DimensionMismatchException if dimension != k
629 */
630 private void checkDimension(int dimension)
631 throws DimensionMismatchException {
632 if (dimension != k) {
633 throw new DimensionMismatchException(dimension, k);
634 }
635 }
636
637 }