Skip to main content

mos_eval/
bibliography.rs

1//! Lower `#bibliography(...)` directives into a semantic node.
2//!
3//! This is the *source boundary* only (manifest §4, MVP 4): the directive
4//! declares one bibliography database path, which is resolved relative to
5//! the current `.mos` source file and stashed on a
6//! [`NodeKind::Bibliography`] node. After lowering, citation keys are checked
7//! against the parsed BibTeX records from those declared sources. Rendering
8//! citation markers and bibliography entries is still a later slice.
9//!
10//! Diagnostics:
11//!
12//! - `MOS0040`: `#bibliography(...)` called without a (non-empty) path.
13//! - `MOS0042`: `#bibliography(...)` declared more than one path; first wins.
14//! - `MOS0020`: the path argument is present but not a string.
15//! - `MOS0015`: an unknown keyword argument was supplied.
16//! - `MOS0041`: the resolved path does not point to a file on disk
17//!   (a warning; the node is still emitted with its resolved path).
18//! - `MOS0045`: a citation key does not exist in a complete parsed bibliography set.
19//! - `MOS0046`: a citation key appears in more than one declared bibliography source.
20
21use std::collections::{BTreeMap, BTreeSet};
22use std::fs;
23use std::path::{Path, PathBuf};
24
25use mos_bib::Bibliography;
26use mos_core::{
27    AttrMap, AttrValue, Diagnostic, Document, NodeId, NodeKind, NodeSpec, SourceSpan, Suggestion,
28    codes,
29};
30use mos_parse::{SetArg, SetValue};
31
32/// Lower a top-level `#bibliography("refs.bib")` directive into a single
33/// [`NodeKind::Bibliography`] node hanging off the document root. The
34/// literal path is recorded under `src`; the path resolved against the
35/// source file's directory is recorded under `resolved_path` so the
36/// later BibTeX reader can open the database without re-deriving the
37/// location.
38pub(super) fn lower_bibliography_directive(
39    document: &mut Document,
40    root: NodeId,
41    args: &[SetArg],
42    span: &SourceSpan,
43    source_file: &Path,
44    diagnostics: &mut Vec<Diagnostic>,
45) {
46    let Some(path) = bibliography_path(args, span, diagnostics) else {
47        return;
48    };
49    let resolved = match mos_core::resolve_source_path(&path, source_file) {
50        Ok(resolved) => resolved,
51        Err(err) => {
52            diagnostics.push(
53                Diagnostic::simple(
54                    &codes::MOS0049,
55                    None,
56                    format!("cannot use bibliography path `{path}`: {err}"),
57                )
58                .with_span(span.clone()),
59            );
60            return;
61        }
62    };
63    // The directive only *declares* the source in this slice, so a missing
64    // file is a non-fatal warning rather than the hard error `#image(...)`
65    // raises: the node is still emitted with its resolved path, and the
66    // BibTeX-reading slice surfaces a read/parse error when it opens the
67    // database for real.
68    if !resolved.is_file() {
69        diagnostics.push(
70            Diagnostic::simple(
71                &codes::MOS0041,
72                None,
73                format!(
74                    "declared bibliography source `{}` was not found",
75                    mos_core::display_path(&resolved)
76                ),
77            )
78            .with_span(span.clone()),
79        );
80    }
81    let mut attributes: AttrMap = BTreeMap::new();
82    attributes.insert("src".to_owned(), AttrValue::Str(path));
83    attributes.insert(
84        "resolved_path".to_owned(),
85        AttrValue::Str(resolved.to_string_lossy().into_owned()),
86    );
87    document.alloc_child(
88        root,
89        NodeSpec::new(NodeKind::Bibliography, span.clone()).with_attributes(attributes),
90    );
91}
92
93/// Pull the single source path out of the directive arguments. A leading
94/// positional string (`#bibliography("refs.bib")`) or the named
95/// `path:`/`src:` forms are accepted, mirroring `#image(...)`. Returns
96/// `None` (after emitting a diagnostic) when the path is missing, empty,
97/// or not a string.
98fn bibliography_path(
99    args: &[SetArg],
100    span: &SourceSpan,
101    diagnostics: &mut Vec<Diagnostic>,
102) -> Option<String> {
103    let mut path: Option<String> = None;
104    let mut invalid_path_arg = false;
105    for arg in args {
106        match arg {
107            // Positional first arg -- the path literal, same shorthand
108            // `#image("path.png")` accepts.
109            SetArg::Positional { value, value_span } => {
110                if let SetValue::Str(s) = value {
111                    if path.is_some() {
112                        diagnostics.push(
113                            Diagnostic::simple(
114                                &codes::MOS0042,
115                                None,
116                                "duplicate path argument for `#bibliography`",
117                            )
118                            .with_span(value_span.clone()),
119                        );
120                    } else {
121                        path = Some(s.clone());
122                    }
123                } else {
124                    invalid_path_arg = true;
125                    diagnostics.push(
126                        Diagnostic::simple(
127                            &codes::MOS0020,
128                            None,
129                            "`#bibliography(...)` expects a string path",
130                        )
131                        .with_span(value_span.clone()),
132                    );
133                }
134            }
135            SetArg::Named {
136                key,
137                value,
138                key_span,
139                value_span,
140            } => match key.as_str() {
141                "src" | "path" => {
142                    if let SetValue::Str(s) = value {
143                        if path.is_some() {
144                            diagnostics.push(
145                                Diagnostic::simple(
146                                    &codes::MOS0042,
147                                    None,
148                                    "duplicate path argument for `#bibliography`",
149                                )
150                                .with_span(value_span.clone()),
151                            );
152                        } else {
153                            path = Some(s.clone());
154                        }
155                    } else {
156                        invalid_path_arg = true;
157                        diagnostics.push(
158                            Diagnostic::simple(
159                                &codes::MOS0020,
160                                None,
161                                "`#bibliography(...)` expects a string path",
162                            )
163                            .with_span(value_span.clone()),
164                        );
165                    }
166                }
167                _ => diagnostics.push(
168                    Diagnostic::simple(
169                        &codes::MOS0015,
170                        None,
171                        format!("unknown argument `{key}` for `#bibliography` (valid: src/path)"),
172                    )
173                    .with_span(key_span.clone()),
174                ),
175            },
176        }
177    }
178    let Some(path) = path else {
179        if invalid_path_arg {
180            return None;
181        }
182        diagnostics.push(
183            Diagnostic::simple(
184                &codes::MOS0040,
185                None,
186                "`#bibliography(...)` requires a path (e.g. `#bibliography(\"refs.bib\")`)",
187            )
188            .with_span(span.clone()),
189        );
190        return None;
191    };
192    // A bare empty / whitespace-only path is the same mistake as omitting
193    // it -- they wrote `#bibliography("")` and meant to fill in a filename.
194    if path.trim().is_empty() {
195        diagnostics.push(
196            Diagnostic::simple(
197                &codes::MOS0040,
198                None,
199                "`#bibliography(...)` requires a non-empty path (e.g. `#bibliography(\"refs.bib\")`)",
200            )
201            .with_span(span.clone()),
202        );
203        return None;
204    }
205    Some(path)
206}
207
208/// Load every declared bibliography source, mark citation nodes whose keys
209/// exist in any parsed record set, and rewrite their visible text to a
210/// numeric label assigned by first-use order. Unknown citation keys emit
211/// `MOS0045` once per citation node and keep their `[?key?]` placeholder.
212///
213/// Numbering is dense over *known* citations: a key consumes a number only
214/// when it resolves, and repeated uses of the same key reuse its first
215/// number. Unresolved keys never burn a slot, so `[1]`, `[2]`, ... always
216/// index real bibliography records. This is the numeric-placeholder slice
217/// (issue #67), not full CSL: no author-year styles, sorted output, or
218/// citation clusters.
219/// Resolve `[@key]` citations and return the set of keys declared by the
220/// loaded bibliography sources. The reference resolver consumes that set to
221/// tell an `@key` label reference that *misses* the label index but *matches*
222/// a bibliography key apart -- a near-certain "meant a citation" mistake --
223/// from a plain unknown label (see [`crate::resolve::resolve`]).
224pub(super) fn resolve_citations(
225    document: &mut Document,
226    diagnostics: &mut Vec<Diagnostic>,
227) -> BTreeSet<String> {
228    let bibliography = load_bibliography(document, diagnostics);
229    let citation_ids: Vec<NodeId> = document
230        .nodes()
231        .filter(|node| node.kind == NodeKind::Citation)
232        .map(|node| node.id)
233        .collect();
234
235    // `nodes()` walks the `BTreeMap<NodeId, Node>` in `NodeId` order, which is
236    // the lowerer's allocation order -- i.e. document order. Collecting the ids
237    // above preserves that order, so the first new key encountered here is the
238    // document's first-cited key.
239    let mut numbers: BTreeMap<String, usize> = BTreeMap::new();
240
241    for citation_id in citation_ids {
242        let Some(node) = document.get(citation_id) else {
243            continue;
244        };
245        let Some(AttrValue::Str(key)) = node.attributes.get("key").cloned() else {
246            continue;
247        };
248        if bibliography.records.entries.contains_key(&key) {
249            let next_number = numbers.len() + 1;
250            let number = *numbers.entry(key.clone()).or_insert(next_number);
251            if let Some(node) = document.get_mut(citation_id) {
252                node.attributes
253                    .insert("resolved".to_owned(), AttrValue::Bool(true));
254                node.attributes
255                    .insert("text".to_owned(), AttrValue::Str(format!("[{number}]")));
256                if let Some(origin) = bibliography.origins.get(&key) {
257                    node.attributes.insert(
258                        "target_path".to_owned(),
259                        AttrValue::Str(origin.path.to_string_lossy().into_owned()),
260                    );
261                    if let (Ok(start), Ok(end)) = (
262                        i64::try_from(origin.key_span.start()),
263                        i64::try_from(origin.key_span.end()),
264                    ) {
265                        node.attributes
266                            .insert("target_span.start".to_owned(), AttrValue::Int(start));
267                        node.attributes
268                            .insert("target_span.end".to_owned(), AttrValue::Int(end));
269                    }
270                }
271            }
272            continue;
273        }
274        if !bibliography.complete {
275            continue;
276        }
277        let mut diagnostic = Diagnostic::simple(
278            &codes::MOS0045,
279            Some(node.span.clone()),
280            format!("unknown citation key `{key}` in bibliography records"),
281        )
282        .with_annotation(mos_core::DiagnosticAnnotation::Hint(
283            "declare the key in a `#bibliography(...)` BibTeX source".to_owned(),
284        ));
285        if let Some(candidate) = nearest_citation_key(&key, &bibliography.records.entries)
286            && let Some(span) = citation_key_span(node, &key)
287        {
288            diagnostic = diagnostic.with_suggestion(Suggestion::new(span, candidate));
289        }
290        diagnostics.push(diagnostic);
291    }
292
293    bibliography.records.entries.keys().cloned().collect()
294}
295
296fn citation_key_span(node: &mos_core::Node, key: &str) -> Option<SourceSpan> {
297    let start = node.span.start().checked_add(2)?;
298    let end = start.checked_add(key.len())?;
299    (end < node.span.end()).then(|| SourceSpan::new(node.span.file.clone(), start, end))
300}
301
/// Report whether `key` is non-empty and made up solely of ASCII letters,
/// ASCII digits, and the punctuation bytes `_`, `-`, `:` and `.`.
fn is_citation_key(key: &str) -> bool {
    if key.is_empty() {
        return false;
    }
    key.bytes().all(|byte| {
        matches!(
            byte,
            b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_' | b'-' | b':' | b'.'
        )
    })
}
308
/// Levenshtein distance between `a` and `b`, measured over their UTF-8
/// bytes with unit cost for insertion, deletion and substitution.
fn edit_distance(a: &str, b: &str) -> usize {
    let target = b.as_bytes();
    // `previous` holds the finished row of the DP table; `current` is the
    // row being filled for the byte of `a` under consideration.
    let mut previous: Vec<usize> = (0..=target.len()).collect();
    let mut current: Vec<usize> = vec![0; target.len() + 1];

    for (consumed, source_byte) in a.bytes().enumerate() {
        current[0] = consumed + 1;
        for (col, &target_byte) in target.iter().enumerate() {
            let substitute = previous[col] + usize::from(source_byte != target_byte);
            let delete = previous[col + 1] + 1;
            let insert = current[col] + 1;
            current[col + 1] = substitute.min(delete).min(insert);
        }
        std::mem::swap(&mut previous, &mut current);
    }

    previous[target.len()]
}
324
325fn nearest_citation_key(
326    unknown: &str,
327    records: &BTreeMap<String, mos_bib::BibEntry>,
328) -> Option<String> {
329    if unknown.len() < 3 {
330        return None;
331    }
332    let max_distance = unknown.len() / 3;
333    let mut best: Option<(usize, &str)> = None;
334    let mut tied = false;
335    for key in records.keys().filter(|key| is_citation_key(key)) {
336        let distance = edit_distance(unknown, key);
337        if distance > max_distance {
338            continue;
339        }
340        match best {
341            None => {
342                best = Some((distance, key.as_str()));
343                tied = false;
344            }
345            Some((best_distance, _)) if distance < best_distance => {
346                best = Some((distance, key.as_str()));
347                tied = false;
348            }
349            Some((best_distance, _)) if distance == best_distance => tied = true,
350            Some(_) => {}
351        }
352    }
353    let (_, key) = best?;
354    (!tied).then(|| key.to_owned())
355}
356
/// Merged result of loading every declared bibliography source.
struct LoadedBibliography {
    /// Entries from all sources that parsed successfully; when a key occurs
    /// in more than one source, the entry from the first source is kept.
    records: Bibliography,
    /// For each key in `records`, the source file and key span it was first
    /// loaded from.
    origins: BTreeMap<String, BibliographyOrigin>,
    /// `false` if any declared source had no string `resolved_path`, was not
    /// a file, could not be read, or failed to parse.
    complete: bool,
}
362
/// Where a bibliography key was first defined.
struct BibliographyOrigin {
    /// Path of the bibliography source file that supplied the key.
    path: PathBuf,
    /// Span of the key within that source file.
    key_span: SourceSpan,
}
367
368fn load_bibliography(document: &Document, diagnostics: &mut Vec<Diagnostic>) -> LoadedBibliography {
369    let mut merged = Bibliography::default();
370    let mut origins: BTreeMap<String, BibliographyOrigin> = BTreeMap::new();
371    let mut complete = true;
372    for node in document
373        .nodes()
374        .filter(|node| node.kind == NodeKind::Bibliography)
375    {
376        let Some(AttrValue::Str(path)) = node.attributes.get("resolved_path") else {
377            complete = false;
378            continue;
379        };
380        let path_buf = PathBuf::from(path);
381        if !path_buf.is_file() {
382            complete = false;
383            continue;
384        }
385        let source = match fs::read_to_string(&path_buf) {
386            Ok(source) => source,
387            Err(err) => {
388                complete = false;
389                diagnostics.push(Diagnostic::simple(
390                    &codes::MOS0041,
391                    Some(node.span.clone()),
392                    format!(
393                        "declared bibliography source `{}` could not be read: {err}",
394                        mos_core::display_path(&path_buf)
395                    ),
396                ));
397                continue;
398            }
399        };
400        match mos_bib::parse_bibtex(&source) {
401            Ok(parsed) => {
402                for (key, entry) in parsed.entries {
403                    let key_span =
404                        SourceSpan::new(path_buf.clone(), entry.key_span.start, entry.key_span.end);
405                    if let Some(first) = origins.get(&key) {
406                        diagnostics.push(
407                            Diagnostic::simple(
408                                &codes::MOS0046,
409                                Some(node.span.clone()),
410                                format!(
411                                    "duplicate citation key `{key}` in bibliography source `{}`",
412                                    mos_core::display_path(&path_buf)
413                                ),
414                            )
415                            .with_annotation(mos_core::DiagnosticAnnotation::Related {
416                                span: first.key_span.clone(),
417                                message: format!(
418                                    "first bibliography source for `{key}` was `{}`",
419                                    mos_core::display_path(&first.path)
420                                ),
421                            })
422                            .with_annotation(mos_core::DiagnosticAnnotation::Hint(
423                                "keep citation keys unique across all declared bibliography sources"
424                                    .to_owned(),
425                            )),
426                        );
427                    } else {
428                        origins.insert(
429                            key.clone(),
430                            BibliographyOrigin {
431                                path: path_buf.clone(),
432                                key_span,
433                            },
434                        );
435                        merged.entries.insert(key, entry);
436                    }
437                }
438            }
439            Err(err) => {
440                complete = false;
441                diagnostics.push(err.to_diagnostic(path_buf));
442            }
443        }
444    }
445    LoadedBibliography {
446        records: merged,
447        origins,
448        complete,
449    }
450}