Skip to main content

mos_bib/
content.rs

//! Content-hash boundary for bibliography inputs (manifest §7, §32; design
//! note `docs/incremental-dependencies.md` §4.1).
//!
//! A future incremental build needs to answer "did this `.bib` change?" before
//! it can decide whether cached citation data is stale.
//! [`bibliography_content_hash`] supplies the *boundary* half of that answer:
//! it pins exactly which bytes feed a bibliography source's content hash, so
//! two builds of the same file converge on the same [`ContentHash`] and any
//! edit diverges. The path-shaped *identity* half lives next door in
//! `mos_cache::DependencyId::Bibliography`, and the two are paired by
//! `mos_cache::BibliographyDependency`.
//!
//! # Hash boundary (design note §4.1)
//!
//! ```text
//! BibliographyContentHash = H(
//!     engine_version,               // stamped by ContentHasher::new()
//!     domain_tag,                   // distinguishes this boundary from other H(...)
//!     file_bytes                    // raw bytes as read, byte-for-byte, no normalization
//! )
//! ```
//!
//! The bytes are hashed **raw**: no NFC, no line-ending fold, no BOM strip. That
//! mirrors §4.1: the parser does not normalize source today, so the content
//! hash must reflect what the parser actually consumed, or the cache would
//! "forget" cosmetic edits the parser is sensitive to. Filesystem-derived data
//! (mtime, inode, absolute path) is deliberately *not* an input.
//!
//! `H` is [`mos_core::ContentHasher`], the shared, engine-version-stamped,
//! length-framed FNV-1a-128 boundary hasher (interim; swappable to BLAKE3 per
//! §9.4 without changing this `&[u8] -> ContentHash` signature). This boundary
//! just supplies the domain tag and the raw bytes.
33
34use mos_core::{ContentHash, ContentHasher};
35
/// Domain-separation tag fed into the hasher ahead of the file bytes.
///
/// It keeps hashes produced by this boundary apart from those of any other
/// `H(...)` boundary that is handed the very same byte sequence. The `/v1`
/// suffix is a version for the *framing* of this boundary and moves
/// independently of `engine_version`.
const DOMAIN_TAG: &[u8] = b"mos-bib/bibliography-source/v1";
40
41/// Compute the content-hash boundary for one bibliography source's raw bytes.
42///
43/// The result is the §4.1 source hash specialized to bibliography inputs:
44/// deterministic for identical bytes, divergent for any byte change, and
45/// independent of where or when the file was read. Pair it with a
46/// `mos_cache::DependencyId::Bibliography` identity (typically via
47/// `mos_cache::BibliographyDependency`) to model a full bibliography
48/// dependency.
49///
50/// # Examples
51///
52/// Identical bytes hash equal; a one-byte edit diverges:
53///
54/// ```
55/// use mos_bib::bibliography_content_hash;
56///
57/// let a = bibliography_content_hash(b"@article{k, year = 1984}");
58/// let b = bibliography_content_hash(b"@article{k, year = 1984}");
59/// assert_eq!(a, b);
60/// assert_ne!(a, bibliography_content_hash(b"@article{k, year = 1985}"));
61/// ```
62///
63/// Hashing is byte-for-byte, so NFC- and NFD-encoded text that *looks* the same
64/// produces different hashes (the parser sees different bytes, §4.1):
65///
66/// ```
67/// use mos_bib::bibliography_content_hash;
68///
69/// // "é" composed (NFC) vs. "e" + combining acute (NFD).
70/// let nfc = bibliography_content_hash("@misc{r, note = {\u{00e9}}}".as_bytes());
71/// let nfd = bibliography_content_hash("@misc{r, note = {e\u{0301}}}".as_bytes());
72/// assert_ne!(nfc, nfd);
73/// ```
74#[must_use]
75pub fn bibliography_content_hash(bytes: &[u8]) -> ContentHash {
76    // ContentHasher::new() stamps engine_version (§5 rule 2); the domain tag and
77    // raw bytes follow, both length-framed. Field order mirrors the §4.1
78    // `SourceHash` shape: engine_version, kind/domain tag, raw bytes.
79    let mut hasher = ContentHasher::new();
80    hasher.field(DOMAIN_TAG).field(bytes);
81    hasher.finish()
82}
83
#[cfg(test)]
mod tests {
    use super::bibliography_content_hash as hash_of;

    /// Hashing one and the same byte string twice gives equal results.
    #[test]
    fn identical_bytes_hash_equal() {
        let source = b"@article{knuth1984, title = {Literate Programming}, year = 1984}";
        let first = hash_of(source);
        let second = hash_of(source);
        assert_eq!(first, second);
    }

    /// Changing a single byte (the last digit of the year) changes the hash.
    #[test]
    fn one_byte_change_diverges() {
        let before = hash_of(b"@article{k, year = 1984}");
        let after = hash_of(b"@article{k, year = 1985}");
        assert_ne!(before, after);
    }

    /// Composed (NFC) and decomposed (NFD) "é" are the same text but different
    /// bytes, so their hashes differ; the parser would see different bytes.
    #[test]
    fn hashing_is_byte_for_byte_not_normalized() {
        let composed = "\u{00e9}".as_bytes();
        let decomposed = "e\u{0301}".as_bytes();
        assert_ne!(hash_of(composed), hash_of(decomposed));
    }

    /// The empty input hashes reproducibly and differently from a lone space.
    #[test]
    fn empty_input_has_a_stable_distinct_hash() {
        let empty = hash_of(b"");
        assert_eq!(empty, hash_of(b""));
        assert_ne!(empty, hash_of(b" "));
    }
}