Skip to main content

mos_core/
hash.rs

1//! Deterministic content hashing for cache boundaries (design note
2//! `docs/incremental-dependencies.md` §4, §5).
3//!
4//! Incremental builds compare *content boundaries*: "did these bytes change?"
5//!, and the §4 hash sketches all share one shape: `H(engine_version, …fields)`.
6//! [`ContentHasher`] is the one place that shape is implemented, so every
7//! boundary (bibliography sources, page layout output, future source/asset
8//! hashes per §9.4) folds its fields the same way instead of re-deriving FNV in
9//! each crate.
10//!
11//! # What it guarantees
12//!
13//! - **Engine-version stamped.** [`ContentHasher::new`] folds
14//!   `CARGO_PKG_VERSION` first, so bumping the engine invalidates every hash
15//!   (§5 rule 2). Callers cannot forget it.
16//! - **Portable & deterministic.** The state is FNV-1a over 128 bits: fully
17//!   specified, endianness-pinned (fixed little-endian widths), and independent
18//!   of pointer identity or hash-map order. Unlike [`std`]'s randomly-seeded
19//!   `SipHash`, which §4 rules out, two runs of the same input always agree.
20//! - **Unambiguous framing.** [`field`](ContentHasher::field) length-prefixes
21//!   its bytes, so concatenated variable-length fields cannot collide across a
22//!   different split. Fixed-width numbers ([`u32`](ContentHasher::u32)) need no
23//!   prefix.
24//!
25//! # Interim hasher
26//!
27//! FNV-1a is an **interim** choice: the design note prefers
28//! BLAKE3-truncated-to-128, and the §9.4 slice may swap the construction. That
29//! swap stays internal; the [`ContentHasher`] API is unchanged, but it does
30//! change hash *values*, which the stamped engine version absorbs. FNV is not
31//! collision-hardened; nothing yet relies on adversarial collision resistance.
32
33/// Opaque content / dependency hash.
34///
35/// # Examples
36///
37/// ```
38/// use mos_core::ContentHash;
39///
40/// let hash = ContentHash::default();
41///
42/// assert_eq!(hash.0, 0);
43/// ```
44#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Default)]
45pub struct ContentHash(pub u128);
46
47/// Engine version stamped into every hash (§5 rule 2).
48const ENGINE_VERSION: &str = env!("CARGO_PKG_VERSION");
49
50/// FNV-1a 128-bit offset basis (FNV spec).
51const FNV_OFFSET_BASIS: u128 = 0x6c62_272e_07bb_0142_62b8_2175_6295_c58d;
52
53/// FNV-1a 128-bit prime (FNV spec): `2^88 + 2^8 + 0x3b`.
54const FNV_PRIME: u128 = 0x0000_0000_0100_0000_0000_0000_0000_013b;
55
56/// An incremental builder for a deterministic [`ContentHash`] over a sequence
57/// of typed fields (design note §4).
58///
59/// Construct with [`new`](Self::new) (which stamps the engine version), fold a
60/// domain tag plus the boundary's fields with [`field`](Self::field) /
61/// [`u32`](Self::u32), then read the result with [`finish`](Self::finish). The
62/// field sequence is the boundary's contract: keep it fixed per domain, and
63/// lead with a domain tag so two boundaries that fold identical bytes still
64/// differ.
65///
66/// # Examples
67///
68/// ```
69/// use mos_core::ContentHasher;
70///
71/// // Same fields in the same order hash equal; any change diverges.
72/// let mut a = ContentHasher::new();
73/// a.field(b"example/v1").u32(7).field(b"payload");
74///
75/// let mut b = ContentHasher::new();
76/// b.field(b"example/v1").u32(7).field(b"payload");
77///
78/// assert_eq!(a.finish(), b.finish());
79///
80/// let mut c = ContentHasher::new();
81/// c.field(b"example/v1").u32(8).field(b"payload");
82/// assert_ne!(a.finish(), c.finish());
83/// ```
84#[derive(Clone, Debug)]
85pub struct ContentHasher {
86    state: u128,
87}
88
89impl ContentHasher {
90    /// Start a hasher, stamping the engine version (§5 rule 2) so callers
91    /// cannot forget it.
92    ///
93    /// # Examples
94    ///
95    /// ```
96    /// use mos_core::ContentHasher;
97    ///
98    /// // A fresh hasher already carries the engine-version stamp, so two
99    /// // freshly-constructed hashers agree before any field is folded.
100    /// assert_eq!(ContentHasher::new().finish(), ContentHasher::new().finish());
101    /// ```
102    #[must_use]
103    pub fn new() -> Self {
104        let mut hasher = Self {
105            state: FNV_OFFSET_BASIS,
106        };
107        hasher.field(ENGINE_VERSION.as_bytes());
108        hasher
109    }
110
111    /// Fold one variable-length field, length-prefixed so field boundaries stay
112    /// unambiguous.
113    ///
114    /// The `u64` length prefix is fixed-width (not `usize`) so the hash is
115    /// identical on 32- and 64-bit targets.
116    ///
117    /// # Examples
118    ///
119    /// ```
120    /// use mos_core::ContentHasher;
121    ///
122    /// // Length framing keeps `("a", "bc")` distinct from `("ab", "c")`.
123    /// let mut split_a = ContentHasher::new();
124    /// split_a.field(b"a").field(b"bc");
125    /// let mut split_b = ContentHasher::new();
126    /// split_b.field(b"ab").field(b"c");
127    /// assert_ne!(split_a.finish(), split_b.finish());
128    /// ```
129    pub fn field(&mut self, bytes: &[u8]) -> &mut Self {
130        let len = u64::try_from(bytes.len()).unwrap_or(u64::MAX);
131        self.fold(&len.to_le_bytes());
132        self.fold(bytes);
133        self
134    }
135
136    /// Fold a fixed-width `u32` (little-endian). No length prefix is needed: the
137    /// width is constant, so the field boundary is implicit.
138    ///
139    /// # Examples
140    ///
141    /// ```
142    /// use mos_core::ContentHasher;
143    ///
144    /// let mut a = ContentHasher::new();
145    /// let mut b = ContentHasher::new();
146    /// a.u32(1);
147    /// b.u32(2);
148    /// assert_ne!(a.finish(), b.finish());
149    /// ```
150    pub fn u32(&mut self, value: u32) -> &mut Self {
151        self.fold(&value.to_le_bytes());
152        self
153    }
154
155    /// The accumulated [`ContentHash`].
156    ///
157    /// # Examples
158    ///
159    /// ```
160    /// use mos_core::{ContentHash, ContentHasher};
161    ///
162    /// let hash: ContentHash = ContentHasher::new().field(b"x").finish();
163    /// // It is a real 128-bit value, not the default.
164    /// assert_ne!(hash, ContentHash::default());
165    /// ```
166    #[must_use]
167    pub fn finish(&self) -> ContentHash {
168        ContentHash(self.state)
169    }
170
171    /// Fold raw bytes into the running FNV-1a state (XOR-then-multiply).
172    fn fold(&mut self, bytes: &[u8]) {
173        for &byte in bytes {
174            self.state ^= u128::from(byte);
175            self.state = self.state.wrapping_mul(FNV_PRIME);
176        }
177    }
178}
179
180impl Default for ContentHasher {
181    fn default() -> Self {
182        Self::new()
183    }
184}
185
186#[cfg(test)]
187mod tests {
188    use super::{ContentHash, ContentHasher};
189
190    #[test]
191    fn same_fields_hash_equal() {
192        let mut a = ContentHasher::new();
193        a.field(b"dom").u32(3).field(b"body");
194        let mut b = ContentHasher::new();
195        b.field(b"dom").u32(3).field(b"body");
196        assert_eq!(a.finish(), b.finish());
197    }
198
199    #[test]
200    fn any_field_change_diverges() {
201        let mut base = ContentHasher::new();
202        base.field(b"dom").u32(3).field(b"body");
203
204        let mut changed_num = ContentHasher::new();
205        changed_num.field(b"dom").u32(4).field(b"body");
206
207        let mut changed_bytes = ContentHasher::new();
208        changed_bytes.field(b"dom").u32(3).field(b"BODY");
209
210        assert_ne!(base.finish(), changed_num.finish());
211        assert_ne!(base.finish(), changed_bytes.finish());
212    }
213
214    #[test]
215    fn length_framing_separates_fields() {
216        let mut split_a = ContentHasher::new();
217        split_a.field(b"a").field(b"bc");
218        let mut split_b = ContentHasher::new();
219        split_b.field(b"ab").field(b"c");
220        assert_ne!(split_a.finish(), split_b.finish());
221    }
222
223    #[test]
224    fn engine_version_is_stamped() {
225        // A fresh hasher is not the raw FNV offset basis: new() folds the
226        // engine version, so finish() already differs from a zero hash.
227        assert_ne!(ContentHasher::new().finish(), ContentHash::default());
228    }
229
230    #[test]
231    fn fixed_width_u32_needs_no_length_prefix_to_stay_unambiguous() {
232        // Two u32s vs. one: distinct because the values differ, and the
233        // fixed-width framing means 1,2 never aliases 2,1.
234        let mut forward = ContentHasher::new();
235        forward.u32(1).u32(2);
236        let mut backward = ContentHasher::new();
237        backward.u32(2).u32(1);
238        assert_ne!(forward.finish(), backward.finish());
239    }
240}