HistogramCalculator.java
package com.seebie.server.service;
import com.seebie.server.dto.StackedHistograms;
import java.util.*;
import static java.lang.Math.round;
import static java.util.function.Function.identity;
import static java.util.stream.Collectors.*;
import static java.util.stream.LongStream.iterate;
public class HistogramCalculator {
/**
* Return the histogram for comparison between multiple data sets of different sizes.
* The normalized count is the count in a class divided by the total number of observations.
* In this case the relative counts are normalized to sum to 100 since a percentage scale is used.
*
* @param binSize
* @param multiDataSets
* @return
*/
public StackedHistograms buildNormalizedHistogram(final int binSize, final List<List<Long>> multiDataSets) {
// build a complete set of bins that can account for all the data sets
// if there was no data in any of the incoming data sets, the bin list will be the empty set
LongSummaryStatistics stats = multiDataSets.stream()
.flatMap(List::stream)
.collect(summarizingLong(i->i));
long lowestBin = (stats.getMin() / binSize) * binSize; // get the lowest bin that is a multiple of binSize
List<Long> unifiedBins = iterate(lowestBin, b -> b <= stats.getMax(), b -> b + binSize)
.boxed()
.toList();
var stackedNormalizedHist = multiDataSets.stream()
.map(durations -> buildHistogram(unifiedBins, durations))
.map(this::normalizeValues)
.toList();
return new StackedHistograms(unifiedBins, stackedNormalizedHist);
}
/**
* Build a histogram of the data.
* The histogram is a map of bin lower bound to count of values in that bin.
* A bin lower bound is the value divided by the bin size, rounded down, times the bin size.
* A bin upper bound is the bin lower bound plus the bin size.
* A bin is a closed interval at the bottom and open at the top.
*/
private Map<Long, Long> buildHistogram(List<Long> binLowerBounds, List<Long> durationMinutes) {
NavigableMap<Long, Long> histogram = new TreeMap<>();
binLowerBounds.forEach(lower -> histogram.put(lower, 0L));
var foundValues = durationMinutes.stream()
.map(histogram::floorKey)
.collect(groupingBy(identity(), counting()));
histogram.putAll(foundValues);
return histogram;
}
private List<Long> normalizeValues(Map<Long, Long> histogram) {
var totalObservations = histogram.values().stream().mapToDouble(Double::valueOf).sum();
return histogram.values().stream()
.map(value -> round(100 * value / totalObservations))
.toList();
}
}