Skip to main content

mos_bib/
parser.rs

1//! Hand-rolled recursive-descent parser for the minimal BibTeX subset.
2//!
3//! The grammar is intentionally tiny:
4//!
5//! ```text
6//! bibtex := ws* (entry ws*)*
7//! entry  := '@' type '{' key (',' fields)? '}'
8//! fields := field (',' field)* ','?
9//! field  := name '=' value
10//! value  := '{' .. '}' | '"' .. '"' | bare
11//! ```
12//!
13//! Entry types and field names are lowercased; citation keys are kept
14//! verbatim. Brace values balance nested `{}` by naive counting, so
15//! `{The {LaTeX} Companion}` is captured whole, but their contents are
16//! stored as raw text; no `TeX` decoding, no `@string` / `@preamble` macro
17//! expansion, no `#` concatenation, no name parsing.
18
19use std::collections::BTreeMap;
20
21use crate::error::{BibParseError, BibParseErrorKind};
22use crate::record::{BibEntry, Bibliography};
23
24/// Parse `input` as a minimal BibTeX database.
25///
26/// Returns a [`Bibliography`] whose entries are keyed by citation key. A
27/// duplicate citation key is rejected so later resolver work can report it
28/// before any source-location context is lost. Parsing stops at the first
29/// malformed entry and returns a [`BibParseError`] pinpointing the byte offset;
30/// well-formed input never panics.
31///
32/// # Errors
33///
34/// Returns a [`BibParseError`] when the input is not a sequence of
35/// well-formed `@type{key, field = value, ...}` entries separated by
36/// whitespace: for example a missing `@`, entry type, `{`, citation key, or
37/// `=`, or an unterminated brace/quote value.
38///
39/// # Examples
40///
41/// ```
42/// use mos_bib::parse_bibtex;
43///
44/// # fn main() -> Result<(), mos_bib::BibParseError> {
45/// let bib = parse_bibtex("@article{rivest1978, author = {Ron Rivest}, year = 1978}")?;
46/// assert_eq!(bib.entries["rivest1978"].fields["year"], "1978");
47/// # Ok(())
48/// # }
49/// ```
50pub fn parse_bibtex(input: &str) -> Result<Bibliography, BibParseError> {
51    let mut parser = Parser::new(input);
52    let mut entries = BTreeMap::new();
53    parser.skip_whitespace();
54    while !parser.at_end() {
55        let parsed = parser.parse_entry()?;
56        if entries.contains_key(&parsed.entry.key) {
57            return Err(BibParseError::new(
58                BibParseErrorKind::DuplicateKey,
59                parsed.key_offset,
60            ));
61        }
62        entries.insert(parsed.entry.key.clone(), parsed.entry);
63        parser.skip_whitespace();
64    }
65    Ok(Bibliography { entries })
66}
67
/// Result of parsing one `@type{key, ...}` entry: the entry itself plus the
/// byte offset of its citation key, which `parse_bibtex` uses as the location
/// of a duplicate-key error.
struct ParsedEntry {
    /// The parsed entry (type, key, key span and fields).
    entry: BibEntry,
    /// Byte offset in the source at which the citation key starts.
    key_offset: usize,
}
72
/// A citation key as it appears in the source, together with where it starts.
struct ParsedKey {
    /// The key text, copied verbatim from the source (no case folding).
    text: String,
    /// Byte offset in the source of the key's first byte.
    offset: usize,
}
77
78/// A byte cursor over the BibTeX source. All structural delimiters
79/// (`@ { } " , =`) and whitespace are ASCII, so scanning byte-by-byte never
80/// splits a multi-byte UTF-8 sequence and every recorded offset lands on a
81/// `char` boundary.
82struct Parser<'a> {
83    src: &'a str,
84    bytes: &'a [u8],
85    pos: usize,
86}
87
88impl<'a> Parser<'a> {
89    fn new(src: &'a str) -> Self {
90        Self {
91            src,
92            bytes: src.as_bytes(),
93            pos: 0,
94        }
95    }
96
97    fn at_end(&self) -> bool {
98        self.pos >= self.bytes.len()
99    }
100
101    fn peek(&self) -> Option<u8> {
102        self.bytes.get(self.pos).copied()
103    }
104
105    fn bump(&mut self) {
106        self.pos += 1;
107    }
108
109    fn skip_whitespace(&mut self) {
110        while let Some(b) = self.peek() {
111            if b.is_ascii_whitespace() {
112                self.bump();
113            } else {
114                break;
115            }
116        }
117    }
118
119    fn error_here(&self, kind: BibParseErrorKind) -> BibParseError {
120        BibParseError::new(kind, self.pos)
121    }
122
123    fn error_at(&self, offset: usize, kind: BibParseErrorKind) -> BibParseError {
124        BibParseError::new(kind, offset)
125    }
126
127    /// Consume `byte` if it is next; otherwise fail with `kind`.
128    fn expect_byte(&mut self, byte: u8, kind: BibParseErrorKind) -> Result<(), BibParseError> {
129        if self.peek() == Some(byte) {
130            self.bump();
131            Ok(())
132        } else {
133            Err(self.error_here(kind))
134        }
135    }
136
137    /// Consume a run of identifier bytes, returning the lowercased text.
138    /// Returns `None` (consuming nothing) when no identifier byte is next.
139    fn take_identifier(&mut self) -> Option<String> {
140        let start = self.pos;
141        while let Some(b) = self.peek() {
142            if is_identifier_byte(b) {
143                self.bump();
144            } else {
145                break;
146            }
147        }
148        if self.pos == start {
149            None
150        } else {
151            Some(self.src[start..self.pos].to_ascii_lowercase())
152        }
153    }
154
155    fn parse_entry(&mut self) -> Result<ParsedEntry, BibParseError> {
156        self.expect_byte(b'@', BibParseErrorKind::ExpectedAt)?;
157        self.skip_whitespace();
158        let entry_type = self
159            .take_identifier()
160            .ok_or_else(|| self.error_here(BibParseErrorKind::ExpectedEntryType))?;
161        self.skip_whitespace();
162        self.expect_byte(b'{', BibParseErrorKind::ExpectedOpenBrace)?;
163        self.skip_whitespace();
164        let key = self.parse_key()?;
165        self.skip_whitespace();
166        let mut fields = BTreeMap::new();
167        match self.peek() {
168            Some(b'}') => self.bump(),
169            Some(b',') => {
170                self.bump();
171                self.parse_fields(&mut fields)?;
172            }
173            Some(_) => return Err(self.error_here(BibParseErrorKind::ExpectedCommaOrCloseBrace)),
174            None => return Err(self.error_here(BibParseErrorKind::UnterminatedEntry)),
175        }
176        let key_span = key.offset..key.offset + key.text.len();
177        Ok(ParsedEntry {
178            entry: BibEntry {
179                entry_type,
180                key: key.text,
181                key_span,
182                fields,
183            },
184            key_offset: key.offset,
185        })
186    }
187
188    /// A citation key runs verbatim until a structural delimiter or
189    /// whitespace. It must be non-empty.
190    fn parse_key(&mut self) -> Result<ParsedKey, BibParseError> {
191        let start = self.pos;
192        while let Some(b) = self.peek() {
193            if is_key_byte(b) {
194                self.bump();
195            } else {
196                break;
197            }
198        }
199        if self.pos == start {
200            return Err(self.error_here(BibParseErrorKind::ExpectedKey));
201        }
202        Ok(ParsedKey {
203            text: self.src[start..self.pos].to_owned(),
204            offset: start,
205        })
206    }
207
208    /// Parse the comma-separated field list up to and including the closing
209    /// `}`. At least one field is required after the key's comma, so
210    /// `@type{key,}` is rejected; a trailing comma *after* a field is accepted.
211    fn parse_fields(&mut self, fields: &mut BTreeMap<String, String>) -> Result<(), BibParseError> {
212        let mut saw_field = false;
213        loop {
214            self.skip_whitespace();
215            match self.peek() {
216                // A `}` ends the list. After the key's comma we still owe a
217                // field, so `@type{key,}` (no field yet) is rejected; once a
218                // field has been seen this is the normal / trailing-comma end.
219                Some(b'}') if saw_field => {
220                    self.bump();
221                    return Ok(());
222                }
223                Some(b'}') => return Err(self.error_here(BibParseErrorKind::ExpectedFieldName)),
224                None => return Err(self.error_here(BibParseErrorKind::UnterminatedEntry)),
225                _ => {}
226            }
227            let name = self
228                .take_identifier()
229                .ok_or_else(|| self.error_here(BibParseErrorKind::ExpectedFieldName))?;
230            self.skip_whitespace();
231            self.expect_byte(b'=', BibParseErrorKind::ExpectedEquals)?;
232            self.skip_whitespace();
233            let value = self.parse_value()?;
234            // Last field wins on a repeated (post-lowercasing) field name.
235            fields.insert(name, value);
236            saw_field = true;
237            self.skip_whitespace();
238            match self.peek() {
239                Some(b',') => self.bump(),
240                Some(b'}') => {
241                    self.bump();
242                    return Ok(());
243                }
244                None => return Err(self.error_here(BibParseErrorKind::UnterminatedEntry)),
245                Some(_) => {
246                    return Err(self.error_here(BibParseErrorKind::ExpectedCommaOrCloseBrace));
247                }
248            }
249        }
250    }
251
252    fn parse_value(&mut self) -> Result<String, BibParseError> {
253        match self.peek() {
254            Some(b'{') => self.parse_braced(),
255            Some(b'"') => self.parse_quoted(),
256            Some(b) if is_bare_value_byte(b) => Ok(self.take_bare_value()),
257            _ => Err(self.error_here(BibParseErrorKind::ExpectedValue)),
258        }
259    }
260
261    /// Capture a `{...}` value, balancing nested braces by naive counting.
262    /// The inner text is returned verbatim, braces and all.
263    fn parse_braced(&mut self) -> Result<String, BibParseError> {
264        let open_offset = self.pos;
265        self.bump(); // consume '{'
266        let content_start = self.pos;
267        let mut depth = 1_usize;
268        while let Some(b) = self.peek() {
269            match b {
270                b'{' => depth += 1,
271                b'}' => {
272                    depth -= 1;
273                    if depth == 0 {
274                        let value = self.src[content_start..self.pos].to_owned();
275                        self.bump(); // consume closing '}'
276                        return Ok(value);
277                    }
278                }
279                _ => {}
280            }
281            self.bump();
282        }
283        Err(self.error_at(open_offset, BibParseErrorKind::UnterminatedValue))
284    }
285
286    /// Capture a `"..."` value, reading to the next unescaped `"` outside
287    /// braced TeX groups. This still stores raw text: the brace tracking only
288    /// keeps common quoted TeX accents like `{\"o}` from ending the value.
289    fn parse_quoted(&mut self) -> Result<String, BibParseError> {
290        let open_offset = self.pos;
291        self.bump(); // consume opening '"'
292        let content_start = self.pos;
293        let mut depth = 0_usize;
294        while let Some(b) = self.peek() {
295            match b {
296                b'\\' => {
297                    self.bump();
298                    if !self.at_end() {
299                        self.bump();
300                    }
301                    continue;
302                }
303                b'{' => depth += 1,
304                b'}' if depth > 0 => depth -= 1,
305                b'"' if depth == 0 => {
306                    let value = self.src[content_start..self.pos].to_owned();
307                    self.bump(); // consume closing '"'
308                    return Ok(value);
309                }
310                _ => {}
311            }
312            self.bump();
313        }
314        Err(self.error_at(open_offset, BibParseErrorKind::UnterminatedValue))
315    }
316
317    /// Capture an unquoted value (e.g. `1984`) as a single token. The caller
318    /// has already confirmed the first byte is a bare-value byte, so the
319    /// result is non-empty. `@string` macros are not resolved.
320    fn take_bare_value(&mut self) -> String {
321        let start = self.pos;
322        while let Some(b) = self.peek() {
323            if is_bare_value_byte(b) {
324                self.bump();
325            } else {
326                break;
327            }
328        }
329        self.src[start..self.pos].to_owned()
330    }
331}
332
/// Whether `b` may appear in an entry type or field name: an ASCII letter or
/// digit, or one of `_ - + . : /`.
fn is_identifier_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b"_-+.:/".contains(&b)
}
337
/// Whether `b` may appear in a citation key. Every byte qualifies except
/// ASCII whitespace and the structural delimiters `, { } " = @`; bytes of
/// multi-byte UTF-8 sequences are therefore accepted.
fn is_key_byte(b: u8) -> bool {
    let is_delimiter = matches!(b, b'@' | b'{' | b'}' | b'"' | b',' | b'=');
    !(is_delimiter || b.is_ascii_whitespace())
}
343
/// Whether `b` may appear in a bare (unquoted, unbraced) value: an ASCII
/// letter or digit, or one of `_ - + . : /`.
fn is_bare_value_byte(b: u8) -> bool {
    match b {
        b'_' | b'-' | b'+' | b'.' | b':' | b'/' => true,
        other => other.is_ascii_alphanumeric(),
    }
}