Adedeji Adebisi | 12c5f11 | 2021-07-22 18:07:52 +0000 | [diff] [blame] | 1 | // Copyright 2021 Google LLC |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | #pragma once |
| 16 | #include <math.h> |
| 17 | #include <stdio.h> |
| 18 | |
| 19 | #include <vector> |
| 20 | template <typename ValueType> |
| 21 | class Histogram |
| 22 | { |
| 23 | public: |
| 24 | Histogram() |
| 25 | { |
| 26 | num_entries_ = 0; |
| 27 | num_low_outliers_ = 0; |
| 28 | num_high_outliers_ = 0; |
| 29 | low_percentile_ = 0; |
| 30 | high_percentile_ = 0; |
| 31 | SetWindowSize(10000); |
| 32 | SetCumulativeDensityThresholds(0.01f, 0.99f); |
| 33 | } |
| 34 | |
| 35 | void AddSample(ValueType s) |
| 36 | { |
| 37 | int N = static_cast<int>(samples_.size()); |
| 38 | samples_[num_entries_ % N] = s; |
| 39 | num_entries_++; |
| 40 | } |
| 41 | |
| 42 | void SetBucketCount(int bc) |
| 43 | { |
| 44 | buckets_.resize(bc); |
| 45 | } |
| 46 | |
| 47 | void SetWindowSize(int s) |
| 48 | { |
| 49 | samples_.resize(s); |
| 50 | } |
| 51 | |
| 52 | void SetCumulativeDensityThresholds(float low_cd, float high_cd) |
| 53 | { |
| 54 | low_cum_density_ = low_cd; |
| 55 | high_cum_density_ = high_cd; |
| 56 | } |
| 57 | |
| 58 | void ComputeHistogram() |
| 59 | { |
| 60 | const int NS = static_cast<int>(samples_.size()); |
| 61 | int N = NS; |
| 62 | if (num_entries_ < N) |
| 63 | { |
| 64 | N = num_entries_; |
| 65 | } |
| 66 | if (N <= 0) |
| 67 | { |
| 68 | return; |
| 69 | } |
| 70 | std::vector<ValueType> sorted; |
| 71 | if (N == NS) |
| 72 | sorted = samples_; |
| 73 | else |
| 74 | sorted.insert(sorted.end(), samples_.begin(), samples_.begin() + N); |
| 75 | sort(sorted.begin(), sorted.end()); |
| 76 | int idx_low = static_cast<int>(N * low_cum_density_); |
| 77 | int idx_high = static_cast<int>(N * high_cum_density_); |
| 78 | if (idx_high - idx_low + 1 < 1) |
| 79 | { |
| 80 | return; // No entries can be shown, quit |
| 81 | } |
| 82 | max_bucket_height_ = 0; |
| 83 | ValueType value_low = sorted[idx_low]; |
| 84 | ValueType value_high = sorted[idx_high]; |
| 85 | low_percentile_ = value_low; |
| 86 | high_percentile_ = value_high; |
| 87 | const int NB = static_cast<int>(buckets_.size()); // Number of Bins |
| 88 | float bucket_width = (value_high - value_low) / NB; |
| 89 | // If all values are the same, manually extend the range to |
| 90 | // (value-1, value+1) |
| 91 | const float EPS = 1e-4; |
| 92 | if (fabs(value_high - value_low) <= EPS) |
| 93 | { |
| 94 | value_low = value_low - 1; |
| 95 | value_high = value_high + 1; |
| 96 | bucket_width = (value_high - value_low) / NB; |
| 97 | } |
| 98 | else |
| 99 | {} |
| 100 | buckets_.assign(NB, 0); |
| 101 | num_low_outliers_ = 0; |
| 102 | num_high_outliers_ = 0; |
| 103 | for (int i = idx_low; i <= idx_high; i++) |
| 104 | { |
| 105 | ValueType v = sorted[i]; |
| 106 | ValueType dist = (v - value_low); |
| 107 | int bucket_idx = dist / bucket_width; |
| 108 | if (bucket_idx < 0) |
| 109 | { |
| 110 | num_low_outliers_++; |
| 111 | } |
| 112 | else if (bucket_idx >= NB) |
| 113 | { |
| 114 | num_high_outliers_++; |
| 115 | } |
| 116 | else |
| 117 | { |
| 118 | buckets_[bucket_idx]++; |
| 119 | max_bucket_height_ = |
| 120 | std::max(max_bucket_height_, buckets_[bucket_idx]); |
| 121 | } |
| 122 | } |
| 123 | } |
| 124 | |
| 125 | int BucketHeight(int idx) |
| 126 | { |
| 127 | if (idx < 0 || idx >= static_cast<int>(buckets_.size())) |
| 128 | return 0; |
| 129 | return buckets_[idx]; |
| 130 | } |
| 131 | |
| 132 | void Assign(Histogram<ValueType>* out) |
| 133 | { |
| 134 | out->num_entries_ = num_entries_; |
| 135 | out->samples_ = samples_; |
| 136 | out->num_low_outliers_ = num_low_outliers_; |
| 137 | out->num_high_outliers_ = num_high_outliers_; |
| 138 | out->buckets_ = buckets_; |
| 139 | out->low_cum_density_ = low_cum_density_; |
| 140 | out->high_cum_density_ = high_cum_density_; |
| 141 | out->low_percentile_ = low_percentile_; |
| 142 | out->high_percentile_ = high_percentile_; |
| 143 | out->max_bucket_height_ = max_bucket_height_; |
| 144 | } |
| 145 | |
| 146 | int MaxBucketHeight() |
| 147 | { |
| 148 | return max_bucket_height_; |
| 149 | } |
| 150 | |
| 151 | ValueType LowPercentile() |
| 152 | { |
| 153 | return low_percentile_; |
| 154 | } |
| 155 | |
| 156 | ValueType HighPercentile() |
| 157 | { |
| 158 | return high_percentile_; |
| 159 | } |
| 160 | |
| 161 | float LowCumDensity() |
| 162 | { |
| 163 | return low_cum_density_; |
| 164 | } |
| 165 | |
| 166 | float HighCumDensity() |
| 167 | { |
| 168 | return high_cum_density_; |
| 169 | } |
| 170 | |
| 171 | bool Empty() |
| 172 | { |
| 173 | return num_entries_ == 0; |
| 174 | } |
| 175 | |
| 176 | int num_entries_; |
| 177 | std::vector<ValueType> samples_; |
| 178 | int num_low_outliers_, num_high_outliers_; |
| 179 | std::vector<int> buckets_; |
| 180 | float low_cum_density_, high_cum_density_; |
| 181 | // "1% percentile" means "1% of the samples are below this value" |
| 182 | ValueType low_percentile_, high_percentile_; |
| 183 | int max_bucket_height_; |
| 184 | }; |