001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.math.stat.descriptive.rank;
018
019 import java.io.Serializable;
020 import java.util.Arrays;
021
022 import org.apache.commons.math.MathRuntimeException;
023 import org.apache.commons.math.exception.util.LocalizedFormats;
024 import org.apache.commons.math.stat.descriptive.AbstractUnivariateStatistic;
025 import org.apache.commons.math.util.FastMath;
026
027 /**
028 * Provides percentile computation.
029 * <p>
030 * There are several commonly used methods for estimating percentiles (a.k.a.
031 * quantiles) based on sample data. For large samples, the different methods
032 * agree closely, but when sample sizes are small, different methods will give
033 * significantly different results. The algorithm implemented here works as follows:
034 * <ol>
035 * <li>Let <code>n</code> be the length of the (sorted) array and
036 * <code>0 < p <= 100</code> be the desired percentile.</li>
037 * <li>If <code> n = 1 </code> return the unique array element (regardless of
038 * the value of <code>p</code>); otherwise </li>
039 * <li>Compute the estimated percentile position
040 * <code> pos = p * (n + 1) / 100</code> and the difference, <code>d</code>
041 * between <code>pos</code> and <code>floor(pos)</code> (i.e. the fractional
042 * part of <code>pos</code>). If <code>pos >= n</code> return the largest
043 * element in the array; otherwise</li>
044 * <li>Let <code>lower</code> be the element in position
045 * <code>floor(pos)</code> in the array and let <code>upper</code> be the
046 * next element in the array. Return <code>lower + d * (upper - lower)</code>
047 * </li>
048 * </ol></p>
049 * <p>
050 * To compute percentiles, the data must be at least partially ordered. Input
051 * arrays are copied and recursively partitioned using an ordering definition.
052 * The ordering used by <code>Arrays.sort(double[])</code> is the one determined
053 * by {@link java.lang.Double#compareTo(Double)}. This ordering makes
054 * <code>Double.NaN</code> larger than any other value (including
055 * <code>Double.POSITIVE_INFINITY</code>). Therefore, for example, the median
056 * (50th percentile) of
057 * <code>{0, 1, 2, 3, 4, Double.NaN}</code> evaluates to <code>2.5.</code></p>
058 * <p>
059 * Since percentile estimation usually involves interpolation between array
060 * elements, arrays containing <code>NaN</code> or infinite values will often
061 * result in <code>NaN<code> or infinite values returned.</p>
062 * <p>
063 * Since 2.2, Percentile implementation uses only selection instead of complete
064 * sorting and caches selection algorithm state between calls to the various
065 * {@code evaluate} methods when several percentiles are to be computed on the same data.
066 * This greatly improves efficiency, both for single percentile and multiple
067 * percentiles computations. However, it also induces a need to be sure the data
068 * at one call to {@code evaluate} is the same as the data with the cached algorithm
069 * state from the previous calls. Percentile does this by checking the array reference
070 * itself and a checksum of its content by default. If the user already knows he calls
071 * {@code evaluate} on an immutable array, he can save the checking time by calling the
072 * {@code evaluate} methods that do <em>not</em>
073 * </p>
074 * <p>
075 * <strong>Note that this implementation is not synchronized.</strong> If
076 * multiple threads access an instance of this class concurrently, and at least
077 * one of the threads invokes the <code>increment()</code> or
078 * <code>clear()</code> method, it must be synchronized externally.</p>
079 *
080 * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $
081 */
082 public class Percentile extends AbstractUnivariateStatistic implements Serializable {
083
084 /** Serializable version identifier */
085 private static final long serialVersionUID = -8091216485095130416L;
086
087 /** Minimum size under which we use a simple insertion sort rather than Hoare's select. */
088 private static final int MIN_SELECT_SIZE = 15;
089
090 /** Maximum number of partitioning pivots cached (each level double the number of pivots). */
091 private static final int MAX_CACHED_LEVELS = 10;
092
093 /** Determines what percentile is computed when evaluate() is activated
094 * with no quantile argument */
095 private double quantile = 0.0;
096
097 /** Cached pivots. */
098 private int[] cachedPivots;
099
100 /**
101 * Constructs a Percentile with a default quantile
102 * value of 50.0.
103 */
104 public Percentile() {
105 this(50.0);
106 }
107
108 /**
109 * Constructs a Percentile with the specific quantile value.
110 * @param p the quantile
111 * @throws IllegalArgumentException if p is not greater than 0 and less
112 * than or equal to 100
113 */
114 public Percentile(final double p) {
115 setQuantile(p);
116 cachedPivots = null;
117 }
118
119 /**
120 * Copy constructor, creates a new {@code Percentile} identical
121 * to the {@code original}
122 *
123 * @param original the {@code Percentile} instance to copy
124 */
125 public Percentile(Percentile original) {
126 copy(original, this);
127 }
128
129 /** {@inheritDoc} */
130 @Override
131 public void setData(final double[] values) {
132 if (values == null) {
133 cachedPivots = null;
134 } else {
135 cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
136 Arrays.fill(cachedPivots, -1);
137 }
138 super.setData(values);
139 }
140
141 /** {@inheritDoc} */
142 @Override
143 public void setData(final double[] values, final int begin, final int length) {
144 if (values == null) {
145 cachedPivots = null;
146 } else {
147 cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
148 Arrays.fill(cachedPivots, -1);
149 }
150 super.setData(values, begin, length);
151 }
152
153 /**
154 * Returns the result of evaluating the statistic over the stored data.
155 * <p>
156 * The stored array is the one which was set by previous calls to
157 * </p>
158 * @param p the percentile value to compute
159 * @return the value of the statistic applied to the stored data
160 */
161 public double evaluate(final double p) {
162 return evaluate(getDataRef(), p);
163 }
164
165 /**
166 * Returns an estimate of the <code>p</code>th percentile of the values
167 * in the <code>values</code> array.
168 * <p>
169 * Calls to this method do not modify the internal <code>quantile</code>
170 * state of this statistic.</p>
171 * <p>
172 * <ul>
173 * <li>Returns <code>Double.NaN</code> if <code>values</code> has length
174 * <code>0</code></li>
175 * <li>Returns (for any value of <code>p</code>) <code>values[0]</code>
176 * if <code>values</code> has length <code>1</code></li>
177 * <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
178 * is null or p is not a valid quantile value (p must be greater than 0
179 * and less than or equal to 100) </li>
180 * </ul></p>
181 * <p>
182 * See {@link Percentile} for a description of the percentile estimation
183 * algorithm used.</p>
184 *
185 * @param values input array of values
186 * @param p the percentile value to compute
187 * @return the percentile value or Double.NaN if the array is empty
188 * @throws IllegalArgumentException if <code>values</code> is null
189 * or p is invalid
190 */
191 public double evaluate(final double[] values, final double p) {
192 test(values, 0, 0);
193 return evaluate(values, 0, values.length, p);
194 }
195
196 /**
197 * Returns an estimate of the <code>quantile</code>th percentile of the
198 * designated values in the <code>values</code> array. The quantile
199 * estimated is determined by the <code>quantile</code> property.
200 * <p>
201 * <ul>
202 * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
203 * <li>Returns (for any value of <code>quantile</code>)
204 * <code>values[begin]</code> if <code>length = 1 </code></li>
205 * <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
206 * is null, or <code>start</code> or <code>length</code>
207 * is invalid</li>
208 * </ul></p>
209 * <p>
210 * See {@link Percentile} for a description of the percentile estimation
211 * algorithm used.</p>
212 *
213 * @param values the input array
214 * @param start index of the first array element to include
215 * @param length the number of elements to include
216 * @return the percentile value
217 * @throws IllegalArgumentException if the parameters are not valid
218 *
219 */
220 @Override
221 public double evaluate( final double[] values, final int start, final int length) {
222 return evaluate(values, start, length, quantile);
223 }
224
225 /**
226 * Returns an estimate of the <code>p</code>th percentile of the values
227 * in the <code>values</code> array, starting with the element in (0-based)
228 * position <code>begin</code> in the array and including <code>length</code>
229 * values.
230 * <p>
231 * Calls to this method do not modify the internal <code>quantile</code>
232 * state of this statistic.</p>
233 * <p>
234 * <ul>
235 * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
236 * <li>Returns (for any value of <code>p</code>) <code>values[begin]</code>
237 * if <code>length = 1 </code></li>
238 * <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
239 * is null , <code>begin</code> or <code>length</code> is invalid, or
240 * <code>p</code> is not a valid quantile value (p must be greater than 0
241 * and less than or equal to 100)</li>
242 * </ul></p>
243 * <p>
244 * See {@link Percentile} for a description of the percentile estimation
245 * algorithm used.</p>
246 *
247 * @param values array of input values
248 * @param p the percentile to compute
249 * @param begin the first (0-based) element to include in the computation
250 * @param length the number of array elements to include
251 * @return the percentile value
252 * @throws IllegalArgumentException if the parameters are not valid or the
253 * input array is null
254 */
255 public double evaluate(final double[] values, final int begin,
256 final int length, final double p) {
257
258 test(values, begin, length);
259
260 if ((p > 100) || (p <= 0)) {
261 throw MathRuntimeException.createIllegalArgumentException(
262 LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p);
263 }
264 if (length == 0) {
265 return Double.NaN;
266 }
267 if (length == 1) {
268 return values[begin]; // always return single value for n = 1
269 }
270 double n = length;
271 double pos = p * (n + 1) / 100;
272 double fpos = FastMath.floor(pos);
273 int intPos = (int) fpos;
274 double dif = pos - fpos;
275 double[] work;
276 int[] pivotsHeap;
277 if (values == getDataRef()) {
278 work = getDataRef();
279 pivotsHeap = cachedPivots;
280 } else {
281 work = new double[length];
282 System.arraycopy(values, begin, work, 0, length);
283 pivotsHeap = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
284 Arrays.fill(pivotsHeap, -1);
285 }
286
287 if (pos < 1) {
288 return select(work, pivotsHeap, 0);
289 }
290 if (pos >= n) {
291 return select(work, pivotsHeap, length - 1);
292 }
293 double lower = select(work, pivotsHeap, intPos - 1);
294 double upper = select(work, pivotsHeap, intPos);
295 return lower + dif * (upper - lower);
296 }
297
298 /**
299 * Select the k<sup>th</sup> smallest element from work array
300 * @param work work array (will be reorganized during the call)
301 * @param pivotsHeap set of pivot index corresponding to elements that
302 * are already at their sorted location, stored as an implicit heap
303 * (i.e. a sorted binary tree stored in a flat array, where the
304 * children of a node at index n are at indices 2n+1 for the left
305 * child and 2n+2 for the right child, with 0-based indices)
306 * @param k index of the desired element
307 * @return k<sup>th</sup> smallest element
308 */
309 private double select(final double[] work, final int[] pivotsHeap, final int k) {
310
311 int begin = 0;
312 int end = work.length;
313 int node = 0;
314
315 while (end - begin > MIN_SELECT_SIZE) {
316
317 final int pivot;
318 if ((node < pivotsHeap.length) && (pivotsHeap[node] >= 0)) {
319 // the pivot has already been found in a previous call
320 // and the array has already been partitioned around it
321 pivot = pivotsHeap[node];
322 } else {
323 // select a pivot and partition work array around it
324 pivot = partition(work, begin, end, medianOf3(work, begin, end));
325 if (node < pivotsHeap.length) {
326 pivotsHeap[node] = pivot;
327 }
328 }
329
330 if (k == pivot) {
331 // the pivot was exactly the element we wanted
332 return work[k];
333 } else if (k < pivot) {
334 // the element is in the left partition
335 end = pivot;
336 node = Math.min(2 * node + 1, pivotsHeap.length); // the min is here to avoid integer overflow
337 } else {
338 // the element is in the right partition
339 begin = pivot + 1;
340 node = Math.min(2 * node + 2, pivotsHeap.length); // the min is here to avoid integer overflow
341 }
342
343 }
344
345 // the element is somewhere in the small sub-array
346 // sort the sub-array using insertion sort
347 insertionSort(work, begin, end);
348 return work[k];
349
350 }
351
352 /** Select a pivot index as the median of three
353 * @param work data array
354 * @param begin index of the first element of the slice
355 * @param end index after the last element of the slice
356 * @return the index of the median element chosen between the
357 * first, the middle and the last element of the array slice
358 */
359 int medianOf3(final double[] work, final int begin, final int end) {
360
361 final int inclusiveEnd = end - 1;
362 final int middle = begin + (inclusiveEnd - begin) / 2;
363 final double wBegin = work[begin];
364 final double wMiddle = work[middle];
365 final double wEnd = work[inclusiveEnd];
366
367 if (wBegin < wMiddle) {
368 if (wMiddle < wEnd) {
369 return middle;
370 } else {
371 return (wBegin < wEnd) ? inclusiveEnd : begin;
372 }
373 } else {
374 if (wBegin < wEnd) {
375 return begin;
376 } else {
377 return (wMiddle < wEnd) ? inclusiveEnd : middle;
378 }
379 }
380
381 }
382
383 /**
384 * Partition an array slice around a pivot
385 * <p>
386 * Partitioning exchanges array elements such that all elements
387 * smaller than pivot are before it and all elements larger than
388 * pivot are after it
389 * </p>
390 * @param work data array
391 * @param begin index of the first element of the slice
392 * @param end index after the last element of the slice
393 * @param pivot initial index of the pivot
394 * @return index of the pivot after partition
395 */
396 private int partition(final double[] work, final int begin, final int end, final int pivot) {
397
398 final double value = work[pivot];
399 work[pivot] = work[begin];
400
401 int i = begin + 1;
402 int j = end - 1;
403 while (i < j) {
404 while ((i < j) && (work[j] >= value)) {
405 --j;
406 }
407 while ((i < j) && (work[i] <= value)) {
408 ++i;
409 }
410
411 if (i < j) {
412 final double tmp = work[i];
413 work[i++] = work[j];
414 work[j--] = tmp;
415 }
416 }
417
418 if ((i >= end) || (work[i] > value)) {
419 --i;
420 }
421 work[begin] = work[i];
422 work[i] = value;
423 return i;
424
425 }
426
427 /**
428 * Sort in place a (small) array slice using insertion sort
429 * @param work array to sort
430 * @param begin index of the first element of the slice to sort
431 * @param end index after the last element of the slice to sort
432 */
433 private void insertionSort(final double[] work, final int begin, final int end) {
434 for (int j = begin + 1; j < end; j++) {
435 final double saved = work[j];
436 int i = j - 1;
437 while ((i >= begin) && (saved < work[i])) {
438 work[i + 1] = work[i];
439 i--;
440 }
441 work[i + 1] = saved;
442 }
443 }
444
445 /**
446 * Returns the value of the quantile field (determines what percentile is
447 * computed when evaluate() is called with no quantile argument).
448 *
449 * @return quantile
450 */
451 public double getQuantile() {
452 return quantile;
453 }
454
455 /**
456 * Sets the value of the quantile field (determines what percentile is
457 * computed when evaluate() is called with no quantile argument).
458 *
459 * @param p a value between 0 < p <= 100
460 * @throws IllegalArgumentException if p is not greater than 0 and less
461 * than or equal to 100
462 */
463 public void setQuantile(final double p) {
464 if (p <= 0 || p > 100) {
465 throw MathRuntimeException.createIllegalArgumentException(
466 LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p);
467 }
468 quantile = p;
469 }
470
471 /**
472 * {@inheritDoc}
473 */
474 @Override
475 public Percentile copy() {
476 Percentile result = new Percentile();
477 copy(this, result);
478 return result;
479 }
480
481 /**
482 * Copies source to dest.
483 * <p>Neither source nor dest can be null.</p>
484 *
485 * @param source Percentile to copy
486 * @param dest Percentile to copy to
487 * @throws NullPointerException if either source or dest is null
488 */
489 public static void copy(Percentile source, Percentile dest) {
490 dest.setData(source.getDataRef());
491 if (source.cachedPivots != null) {
492 System.arraycopy(source.cachedPivots, 0, dest.cachedPivots, 0, source.cachedPivots.length);
493 }
494 dest.quantile = source.quantile;
495 }
496
497 }