mos_core/hash.rs
1//! Deterministic content hashing for cache boundaries (design note
2//! `docs/incremental-dependencies.md` §4, §5).
3//!
4//! Incremental builds compare *content boundaries*: "did these bytes change?"
5//!, and the §4 hash sketches all share one shape: `H(engine_version, …fields)`.
6//! [`ContentHasher`] is the one place that shape is implemented, so every
7//! boundary (bibliography sources, page layout output, future source/asset
8//! hashes per §9.4) folds its fields the same way instead of re-deriving FNV in
9//! each crate.
10//!
11//! # What it guarantees
12//!
13//! - **Engine-version stamped.** [`ContentHasher::new`] folds
14//! `CARGO_PKG_VERSION` first, so bumping the engine invalidates every hash
15//! (§5 rule 2). Callers cannot forget it.
16//! - **Portable & deterministic.** The state is FNV-1a over 128 bits: fully
17//! specified, endianness-pinned (fixed little-endian widths), and independent
18//! of pointer identity or hash-map order. Unlike [`std`]'s randomly-seeded
19//! `SipHash`, which §4 rules out, two runs of the same input always agree.
20//! - **Unambiguous framing.** [`field`](ContentHasher::field) length-prefixes
21//! its bytes, so concatenated variable-length fields cannot collide across a
22//! different split. Fixed-width numbers ([`u32`](ContentHasher::u32)) need no
23//! prefix.
24//!
25//! # Interim hasher
26//!
27//! FNV-1a is an **interim** choice: the design note prefers
28//! BLAKE3-truncated-to-128, and the §9.4 slice may swap the construction. That
29//! swap stays internal; the [`ContentHasher`] API is unchanged, but it does
30//! change hash *values*, which the stamped engine version absorbs. FNV is not
31//! collision-hardened; nothing yet relies on adversarial collision resistance.
32
33/// Opaque content / dependency hash.
34///
35/// # Examples
36///
37/// ```
38/// use mos_core::ContentHash;
39///
40/// let hash = ContentHash::default();
41///
42/// assert_eq!(hash.0, 0);
43/// ```
44#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Default)]
45pub struct ContentHash(pub u128);
46
47/// Engine version stamped into every hash (§5 rule 2).
48const ENGINE_VERSION: &str = env!("CARGO_PKG_VERSION");
49
50/// FNV-1a 128-bit offset basis (FNV spec).
51const FNV_OFFSET_BASIS: u128 = 0x6c62_272e_07bb_0142_62b8_2175_6295_c58d;
52
53/// FNV-1a 128-bit prime (FNV spec): `2^88 + 2^8 + 0x3b`.
54const FNV_PRIME: u128 = 0x0000_0000_0100_0000_0000_0000_0000_013b;
55
56/// An incremental builder for a deterministic [`ContentHash`] over a sequence
57/// of typed fields (design note §4).
58///
59/// Construct with [`new`](Self::new) (which stamps the engine version), fold a
60/// domain tag plus the boundary's fields with [`field`](Self::field) /
61/// [`u32`](Self::u32), then read the result with [`finish`](Self::finish). The
62/// field sequence is the boundary's contract: keep it fixed per domain, and
63/// lead with a domain tag so two boundaries that fold identical bytes still
64/// differ.
65///
66/// # Examples
67///
68/// ```
69/// use mos_core::ContentHasher;
70///
71/// // Same fields in the same order hash equal; any change diverges.
72/// let mut a = ContentHasher::new();
73/// a.field(b"example/v1").u32(7).field(b"payload");
74///
75/// let mut b = ContentHasher::new();
76/// b.field(b"example/v1").u32(7).field(b"payload");
77///
78/// assert_eq!(a.finish(), b.finish());
79///
80/// let mut c = ContentHasher::new();
81/// c.field(b"example/v1").u32(8).field(b"payload");
82/// assert_ne!(a.finish(), c.finish());
83/// ```
84#[derive(Clone, Debug)]
85pub struct ContentHasher {
86 state: u128,
87}
88
89impl ContentHasher {
90 /// Start a hasher, stamping the engine version (§5 rule 2) so callers
91 /// cannot forget it.
92 ///
93 /// # Examples
94 ///
95 /// ```
96 /// use mos_core::ContentHasher;
97 ///
98 /// // A fresh hasher already carries the engine-version stamp, so two
99 /// // freshly-constructed hashers agree before any field is folded.
100 /// assert_eq!(ContentHasher::new().finish(), ContentHasher::new().finish());
101 /// ```
102 #[must_use]
103 pub fn new() -> Self {
104 let mut hasher = Self {
105 state: FNV_OFFSET_BASIS,
106 };
107 hasher.field(ENGINE_VERSION.as_bytes());
108 hasher
109 }
110
111 /// Fold one variable-length field, length-prefixed so field boundaries stay
112 /// unambiguous.
113 ///
114 /// The `u64` length prefix is fixed-width (not `usize`) so the hash is
115 /// identical on 32- and 64-bit targets.
116 ///
117 /// # Examples
118 ///
119 /// ```
120 /// use mos_core::ContentHasher;
121 ///
122 /// // Length framing keeps `("a", "bc")` distinct from `("ab", "c")`.
123 /// let mut split_a = ContentHasher::new();
124 /// split_a.field(b"a").field(b"bc");
125 /// let mut split_b = ContentHasher::new();
126 /// split_b.field(b"ab").field(b"c");
127 /// assert_ne!(split_a.finish(), split_b.finish());
128 /// ```
129 pub fn field(&mut self, bytes: &[u8]) -> &mut Self {
130 let len = u64::try_from(bytes.len()).unwrap_or(u64::MAX);
131 self.fold(&len.to_le_bytes());
132 self.fold(bytes);
133 self
134 }
135
136 /// Fold a fixed-width `u32` (little-endian). No length prefix is needed: the
137 /// width is constant, so the field boundary is implicit.
138 ///
139 /// # Examples
140 ///
141 /// ```
142 /// use mos_core::ContentHasher;
143 ///
144 /// let mut a = ContentHasher::new();
145 /// let mut b = ContentHasher::new();
146 /// a.u32(1);
147 /// b.u32(2);
148 /// assert_ne!(a.finish(), b.finish());
149 /// ```
150 pub fn u32(&mut self, value: u32) -> &mut Self {
151 self.fold(&value.to_le_bytes());
152 self
153 }
154
155 /// The accumulated [`ContentHash`].
156 ///
157 /// # Examples
158 ///
159 /// ```
160 /// use mos_core::{ContentHash, ContentHasher};
161 ///
162 /// let hash: ContentHash = ContentHasher::new().field(b"x").finish();
163 /// // It is a real 128-bit value, not the default.
164 /// assert_ne!(hash, ContentHash::default());
165 /// ```
166 #[must_use]
167 pub fn finish(&self) -> ContentHash {
168 ContentHash(self.state)
169 }
170
171 /// Fold raw bytes into the running FNV-1a state (XOR-then-multiply).
172 fn fold(&mut self, bytes: &[u8]) {
173 for &byte in bytes {
174 self.state ^= u128::from(byte);
175 self.state = self.state.wrapping_mul(FNV_PRIME);
176 }
177 }
178}
179
180impl Default for ContentHasher {
181 fn default() -> Self {
182 Self::new()
183 }
184}
185
186#[cfg(test)]
187mod tests {
188 use super::{ContentHash, ContentHasher};
189
190 #[test]
191 fn same_fields_hash_equal() {
192 let mut a = ContentHasher::new();
193 a.field(b"dom").u32(3).field(b"body");
194 let mut b = ContentHasher::new();
195 b.field(b"dom").u32(3).field(b"body");
196 assert_eq!(a.finish(), b.finish());
197 }
198
199 #[test]
200 fn any_field_change_diverges() {
201 let mut base = ContentHasher::new();
202 base.field(b"dom").u32(3).field(b"body");
203
204 let mut changed_num = ContentHasher::new();
205 changed_num.field(b"dom").u32(4).field(b"body");
206
207 let mut changed_bytes = ContentHasher::new();
208 changed_bytes.field(b"dom").u32(3).field(b"BODY");
209
210 assert_ne!(base.finish(), changed_num.finish());
211 assert_ne!(base.finish(), changed_bytes.finish());
212 }
213
214 #[test]
215 fn length_framing_separates_fields() {
216 let mut split_a = ContentHasher::new();
217 split_a.field(b"a").field(b"bc");
218 let mut split_b = ContentHasher::new();
219 split_b.field(b"ab").field(b"c");
220 assert_ne!(split_a.finish(), split_b.finish());
221 }
222
223 #[test]
224 fn engine_version_is_stamped() {
225 // A fresh hasher is not the raw FNV offset basis: new() folds the
226 // engine version, so finish() already differs from a zero hash.
227 assert_ne!(ContentHasher::new().finish(), ContentHash::default());
228 }
229
230 #[test]
231 fn fixed_width_u32_needs_no_length_prefix_to_stay_unambiguous() {
232 // Two u32s vs. one: distinct because the values differ, and the
233 // fixed-width framing means 1,2 never aliases 2,1.
234 let mut forward = ContentHasher::new();
235 forward.u32(1).u32(2);
236 let mut backward = ContentHasher::new();
237 backward.u32(2).u32(1);
238 assert_ne!(forward.finish(), backward.finish());
239 }
240}