Make index_for use usize, simplify initialization logic, add comments

Marshall Pierce · Marshall Pierce · commit 64fb32691c39 · 2017-02-17T21:11:30.000-08:00
diff --git a/src/lib.rs b/src/lib.rs
@@ -297,7 +297,7 @@ impl<T: Counter> Histogram<T> {
     /// Find the bucket the given value should be placed in.
     ///
     /// May panic if the given value falls outside the current range of the histogram.
-    fn index_for(&self, value: u64) -> isize {
+    fn index_for(&self, value: u64) -> usize {
         let bucket_index = self.bucket_for(value);
         let sub_bucket_index = self.sub_bucket_for(value, bucket_index);
 
@@ -307,27 +307,26 @@ impl<T: Counter> Histogram<T> {
         // Calculate the index for the first entry that will be used in the bucket (halfway through
         // sub_bucket_count). For bucket_index 0, all sub_bucket_count entries may be used, but
         // bucket_base_index is still set in the middle.
-        let bucket_base_index = (bucket_index as isize + 1) << self.sub_bucket_half_count_magnitude;
+        let bucket_base_index = (bucket_index as usize + 1) << self.sub_bucket_half_count_magnitude;
 
         // Calculate the offset in the bucket. This subtraction will result in a positive value in
         // all buckets except the 0th bucket (since a value in that bucket may be less than half
         // the bucket's 0 to sub_bucket_count range). However, this works out since we give bucket 0
         // twice as much space.
         let offset_in_bucket = sub_bucket_index as isize - self.sub_bucket_half_count as isize;
 
-        // The following is the equivalent of
-        // ((sub_bucket_index  - sub_bucket_half_count) + bucket_base_index;
-        (bucket_base_index + offset_in_bucket)
+        let index = bucket_base_index as isize + offset_in_bucket;
+        // This is always non-negative because offset_in_bucket is only negative (and only then by
+        // sub_bucket_half_count at most) for bucket 0, and bucket_base_index will be halfway into
+        // bucket 0's sub buckets in that case.
+        debug_assert!(index >= 0);
+        return index as usize;
     }
 
     /// Get a mutable reference to the count bucket for the given value, if it is in range.
     fn mut_at(&mut self, value: u64) -> Option<&mut T> {
         let i = self.index_for(value);
-        if i < 0 {
-            return None;
-        }
-
-        self.counts.get_mut(i as usize)
+        self.counts.get_mut(i)
     }
 
     // ********************************************************************************************
@@ -426,7 +425,7 @@ impl<T: Counter> Histogram<T> {
             // and add each non-zero value found at it's proper value:
 
             // Do max value first, to avoid max value updates on each iteration:
-            let other_max_index = source.index_for(source.max()) as usize;
+            let other_max_index = source.index_for(source.max());
             let other_count = source[other_max_index];
             self.record_n(source.value_for(other_max_index), other_count).unwrap();
 
@@ -618,7 +617,7 @@ impl<T: Counter> Histogram<T> {
         // largest value with single unit resolution, in [2, 200_000].
         let largest = 2 * 10_u32.pow(sigfig as u32);
 
-        let unit_magnitude = ((low as f64).log2() / 2_f64.log2()).floor() as u8;
+        let unit_magnitude = (low as f64).log2().floor() as u8;
         let unit_magnitude_mask = (1 << unit_magnitude) - 1;
 
         // We need to maintain power-of-two sub_bucket_count (for clean direct indexing) that is
@@ -628,15 +627,11 @@ impl<T: Counter> Histogram<T> {
         // that.
         // In [1, 18]. 2^18 > 2 * 10^5 (the largest possible
         // largest_value_with_single_unit_resolution)
-        let sub_bucket_count_magnitude = ((largest as f64).log2() / 2_f64.log2()).ceil() as u8;
-        let sub_bucket_half_count_magnitude = if sub_bucket_count_magnitude > 1 {
-            sub_bucket_count_magnitude
-        } else {
-            1
-        } - 1;
-        let sub_bucket_count = 2_u32.pow(sub_bucket_half_count_magnitude as u32 + 1);
+        let sub_bucket_count_magnitude = (largest as f64).log2().ceil() as u8;
+        let sub_bucket_half_count_magnitude = sub_bucket_count_magnitude - 1;
+        let sub_bucket_count = 1_u32 << (sub_bucket_count_magnitude as u32);
 
-        if unit_magnitude + sub_bucket_half_count_magnitude + 1 > 63 {
+        if unit_magnitude + sub_bucket_count_magnitude > 63 {
             // Cannot represent shifted sub bucket count in 64 bits.
             // This will cause an infinite loop when calculating number of buckets
             return Err("Cannot represent significant figures' worth of measurements beyond lowest value");
@@ -660,7 +655,7 @@ impl<T: Counter> Histogram<T> {
 
             // Establish leading_zero_count_base, used in bucket_index_of() fast path:
             // subtract the bits that would be used by the largest value in bucket 0.
-            leading_zero_count_base: (64 - unit_magnitude - sub_bucket_half_count_magnitude - 1),
+            leading_zero_count_base: 64 - unit_magnitude - sub_bucket_count_magnitude,
             sub_bucket_half_count_magnitude: sub_bucket_half_count_magnitude,
 
             unit_magnitude: unit_magnitude,
@@ -1089,7 +1084,7 @@ impl<T: Counter> Histogram<T> {
             return 100.0;
         }
 
-        let target_index = cmp::min(self.index_for(value), self.last() as isize) as usize;
+        let target_index = cmp::min(self.index_for(value), self.last());
         let total_to_current_index =
             (0..(target_index + 1)).map(|i| self[i]).fold(T::zero(), |t, v| t + v);
         100.0 * total_to_current_index.to_f64().unwrap() / self.total_count as f64
@@ -1107,8 +1102,8 @@ impl<T: Counter> Histogram<T> {
     /// May fail if the given values are out of bounds.
     pub fn count_between(&self, low: u64, high: u64) -> Result<T, ()> {
         use std::cmp;
-        let low_index = cmp::max(0, self.index_for(low)) as usize;
-        let high_index = cmp::min(self.index_for(high), self.last() as isize) as usize;
+        let low_index = self.index_for(low);
+        let high_index = cmp::min(self.index_for(high), self.last());
         Ok((low_index..(high_index + 1)).map(|i| self[i]).fold(T::zero(), |t, v| t + v))
     }
 
@@ -1121,7 +1116,7 @@ impl<T: Counter> Histogram<T> {
     /// May fail if the given value is out of bounds.
     pub fn count_at(&self, value: u64) -> Result<T, ()> {
         use std::cmp;
-        Ok(self[cmp::min(cmp::max(0, self.index_for(value)), self.last() as isize) as usize])
+        Ok(self[cmp::min(self.index_for(value), self.last())])
     }
 
     // ********************************************************************************************
@@ -1206,6 +1201,7 @@ impl<T: Counter> Histogram<T> {
         let bucket_index = self.bucket_for(value);
         let sub_bucket_index = self.sub_bucket_for(value, bucket_index);
         // calculate distance to next value
+        // TODO when is sub_bucket_index >= sub_bucket_count?
         1_u64 <<
         (self.unit_magnitude +
          if sub_bucket_index >= self.sub_bucket_count {
@@ -1219,27 +1215,27 @@ impl<T: Counter> Histogram<T> {
     // Internal helpers
     // ********************************************************************************************
 
-    /// Compute the lowest (and therefore highest precision) bucket index that can represent the
-    /// value.
+    /// Compute the lowest (and therefore highest precision) bucket index whose sub-buckets can
+    /// represent the value.
     #[inline]
     fn bucket_for(&self, value: u64) -> u8 {
         // Calculates the number of powers of two by which the value is greater than the biggest
         // value that fits in bucket 0. This is the bucket index since each successive bucket can
         // hold a value 2x greater. The mask maps small values to bucket 0.
-        (self.leading_zero_count_base - (value | self.sub_bucket_mask).leading_zeros() as u8)
+        self.leading_zero_count_base - (value | self.sub_bucket_mask).leading_zeros() as u8
     }
 
     #[inline]
-    /// Compute the position inside a bucket at which the given value should be recorded.
+    /// Compute the position inside a bucket at which the given value should be recorded, indexed
+    /// from position 0 in the bucket (in the first half, which is not used past the first bucket).
+    /// For bucket_index > 0, the result will be in the top half of the bucket.
     fn sub_bucket_for(&self, value: u64, bucket_index: u8) -> u32 {
-        // For bucket_index 0, this is just value, so it may be anywhere in 0 to sub_bucket_count. For
-        // other bucket_index, this will always end up in the top half of sub_bucket_count: assume
-        // that for some bucket k > 0, this calculation will yield a value in the bottom half of 0
-        // to sub_bucket_count. Then, because of how buckets overlap, it would have also been in the
-        // top half of bucket k-1, and therefore would have returned k-1 in bucket_index_of(). Since
-        // we would then shift it one fewer bits here, it would be twice as big, and therefore in
-        // the top half of sub_bucket_count.
-        // TODO: >>> ?
+        // Since bucket_index is simply how many powers of 2 greater value is than what will fit in
+        // bucket 0 (that is, what will fit in [0, sub_bucket_count)), we shift off that many
+        // powers of two, and end up with a number in [0, sub_bucket_count).
+        // For bucket_index 0, this is just value. For bucket index k > 0, we know value won't fit
+        // in bucket (k - 1) by definition, so this calculation won't end up in the lower half of
+        // [0, sub_bucket_count) because that would mean it would also fit in bucket (k - 1).
         (value >> (bucket_index + self.unit_magnitude)) as u32
     }
 
@@ -1286,6 +1282,7 @@ impl<T: Counter> Histogram<T> {
     ///
     /// May fail if `high` is not at least twice the lowest discernible value.
     fn cover(&mut self, high: u64) -> u32 {
+        // TODO validate before we get to here so we don't need an assert
         assert!(high >= 2 * self.lowest_discernible_value,
             "highest trackable value must be >= (2 * lowest discernible value)");