Skip to content
351 changes: 351 additions & 0 deletions src/reader/dtd.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,351 @@
use std::num::NonZeroU8;

/// State of the scanner that skips over a `<!DOCTYPE ...>` declaration.
///
/// The state is stored between calls to `feed`, so the declaration may be
/// delivered in several chunks.
#[derive(Debug, Clone, Copy, PartialEq)]
pub(super) enum DTDParseState {
/// Scanning the part of the declaration that precedes the internal subset.
///
/// If inside a PubidLiteral or SystemLiteral, it holds the quote byte that opened
/// the literal (either `'` or `"`). Otherwise, it holds `None`.
///
/// ```text
/// [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
/// ```
BeforeInternalSubset(Option<NonZeroU8>),
/// Inside the internal subset (after `[`), but outside of any comment,
/// processing instruction or markup declaration.
InsideOfInternalSubset,
/// Inside a comment (`<!--` was seen, `-->` not yet).
InComment,
/// Inside a processing instruction (`<?` was seen, `?>` not yet).
InPI,
/// Inside an element declaration; it ends at the next `>`.
///
/// ```text
/// [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
/// ```
InElementDecl,
/// Inside an attribute-list declaration.
///
/// If inside a DefaultDecl's AttValue, it holds the quote byte that opened the value
/// (either `'` or `"`). Otherwise, it holds `None`.
///
/// ```text
/// [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
/// [53] AttDef ::= S Name S AttType S DefaultDecl
/// [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue)
/// ```
InAttlistDecl(Option<NonZeroU8>),
/// Inside an entity declaration.
///
/// If inside a quoted literal (EntityValue, PubidLiteral or SystemLiteral), it holds the
/// quote byte that opened the literal (either `'` or `"`). Otherwise, it holds `None`.
///
/// ```text
/// [70] EntityDecl ::= GEDecl | PEDecl
/// [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
/// [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
/// [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
/// [74] PEDef ::= EntityValue | ExternalID
/// [75] ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
/// [76] NDataDecl ::= S 'NDATA' S Name
/// ```
InEntityDecl(Option<NonZeroU8>),
/// Inside a notation declaration.
///
/// If inside a PubidLiteral or SystemLiteral, it holds the quote byte that opened
/// the literal (either `'` or `"`). Otherwise, it holds `None`.
///
/// ```text
/// [82] NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
/// ```
InNotationDecl(Option<NonZeroU8>),
/// A `<` was seen, but the data available so far was not enough to tell
/// which kind of markup it starts.
///
/// It holds the number of bytes read since the start of the markup,
/// counting the opening `<` itself.
UndecidedMarkup(usize),
/// The `>` that closes the DOCTYPE declaration has been found.
Finished,
}

impl DTDParseState {
/// skip DTD contents.
///
/// # Parameters (as same as `reader::BangType::parse`)
/// - `buf`: buffer with data consumed on previous iterations
/// - `chunk`: data read on current iteration and not yet consumed from reader
pub(super) fn feed<'b>(&mut self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
// This method assumes the DTD is well-formed.
// Since this crate does not support parsing DTDs, the inability to read non-well-formed DTDs
// is not particularly problematic; the only point of interest is reporting well-formed DTDs
// to the user without errors.

let mut cur = chunk;
while !cur.is_empty() {
match *self {
DTDParseState::BeforeInternalSubset(quote) => {
if let Some(quote) = quote.map(|q| q.get()) {
// ExternalID handling
if let Some(i) = memchr::memchr(quote, cur) {
*self = DTDParseState::BeforeInternalSubset(None);
cur = &cur[i + 1..];
continue;
} else {
break;
}
} else if let Some(i) = cur
.iter()
.position(|&b| matches!(b, b'\'' | b'"' | b'[' | b'>'))
{
let b = cur[i];
match b {
b'\'' | b'"' => {
// system id or public id is found.
*self = DTDParseState::BeforeInternalSubset(Some(
b.try_into().unwrap(),
));
cur = &cur[i + 1..];
continue;
}
b'[' => {
*self = DTDParseState::InsideOfInternalSubset;
cur = &cur[i + 1..];
continue;
}
b'>' => {
*self = DTDParseState::Finished;
let len = chunk.len() - cur.len() + i;
// +1 for `>`
return Some((&chunk[..len], len + 1));
}
_ => {}
}
} else {
break;
}
}
DTDParseState::InsideOfInternalSubset => {
if chunk.len() == cur.len() && buf.ends_with(b"]") && cur.starts_with(b">") {
*self = DTDParseState::Finished;
return Some((&[], 1));
}
if let Some(i) = memchr::memchr2(b']', b'<', cur) {
if cur[i] == b']' {
if cur.len() > i + 1 && cur[i + 1] == b'>' {
*self = DTDParseState::Finished;
let len = chunk.len() - cur.len() + i + 1;
return Some((&chunk[..len], len + 1)); // +1 for `>`
} else {
cur = &cur[i + 1..];
}
} else {
match cur[i + 1..] {
[b'?', ..] => {
// <?
*self = DTDParseState::InPI;
cur = &cur[i + 2..];
}
[b'!', b'-', b'-', ..] => {
// <!--
*self = DTDParseState::InComment;
cur = &cur[i + 4..];
}
[b'!', b'E', b'L', b'E', b'M', b'E', b'N', b'T', ..] => {
// <!ELEMENT
*self = DTDParseState::InElementDecl;
cur = &cur[i + 9..];
}
[b'!', b'E', b'N', b'T', b'I', b'T', b'Y', ..] => {
// <!ENTITY
*self = DTDParseState::InEntityDecl(None);
cur = &cur[i + 8..];
}
[b'!', b'A', b'T', b'T', b'L', b'I', b'S', b'T', ..] => {
// <!ATTLIST
*self = DTDParseState::InAttlistDecl(None);
cur = &cur[i + 9..];
}
[b'!', b'N', b'O', b'T', b'A', b'T', b'I', b'O', b'N', ..] => {
// <!NOTATION
*self = DTDParseState::InNotationDecl(None);
cur = &cur[i + 10..];
}
_ => {
// Hmm... It's not ideal, but judging from the test cases in issue590,
// this crate seems designed not to report errors on incorrect markup
// declarations. So I'll go with ending the unknown markup with `>`.
if let Some(j) = memchr::memchr(b'>', &cur[i + 1..]) {
*self = DTDParseState::InsideOfInternalSubset;
cur = &cur[i + 1 + j + 1..];
} else {
let offset = cur.len() - i;
*self = DTDParseState::UndecidedMarkup(offset);
break;
}
}
}
}
} else {
break;
}
}
DTDParseState::InComment => {
if chunk.len() == cur.len() {
match buf {
[.., b'<', b'!', b'-', b'-'] => {}
[.., b'-', b'-'] if chunk.starts_with(b">") => {
cur = &cur[1..];
*self = DTDParseState::InsideOfInternalSubset;
continue;
}
[.., b'-'] if chunk.starts_with(b"->") => {
cur = &cur[2..];
*self = DTDParseState::InsideOfInternalSubset;
continue;
}
[..] => {}
}
}
if let Some(i) = cur.windows(3).position(|c| c == b"-->") {
cur = &cur[i + 3..];
*self = DTDParseState::InsideOfInternalSubset;
} else {
break;
}
}
DTDParseState::InPI => {
if chunk.len() == cur.len() {
if !buf.ends_with(b"<?") && buf.ends_with(b"?") && cur.starts_with(b">") {
*self = DTDParseState::InsideOfInternalSubset;
cur = &cur[1..];
}
} else if let Some(i) = cur.windows(2).position(|c| c == b"?>") {
*self = DTDParseState::InsideOfInternalSubset;
cur = &cur[i + 2..];
} else {
break;
}
}
DTDParseState::InElementDecl => {
if let Some(i) = memchr::memchr(b'>', cur) {
cur = &cur[i + 1..];
*self = DTDParseState::InsideOfInternalSubset;
continue;
}
break;
}
DTDParseState::InAttlistDecl(quote) => {
if let Some(quote) = quote.map(|q| q.get()) {
if let Some(i) = memchr::memchr(quote, cur) {
cur = &cur[i + 1..];
*self = DTDParseState::InAttlistDecl(None);
} else {
break;
}
} else if let Some(i) = memchr::memchr3(b'\'', b'"', b'>', cur) {
match cur[i] {
b @ (b'\'' | b'"') => {
*self = DTDParseState::InAttlistDecl(b.try_into().ok())
}
b'>' => *self = DTDParseState::InsideOfInternalSubset,
_ => {}
}
cur = &cur[i + 1..];
} else {
break;
}
}
DTDParseState::InEntityDecl(quote) => {
if let Some(quote) = quote.map(|q| q.get()) {
if let Some(i) = memchr::memchr(quote, cur) {
cur = &cur[i + 1..];
*self = DTDParseState::InEntityDecl(None);
} else {
break;
}
} else if let Some(i) = memchr::memchr3(b'\'', b'"', b'>', cur) {
match cur[i] {
b @ (b'\'' | b'"') => {
*self = DTDParseState::InEntityDecl(b.try_into().ok())
}
b'>' => *self = DTDParseState::InsideOfInternalSubset,
_ => {}
}
cur = &cur[i + 1..];
} else {
break;
}
}
DTDParseState::InNotationDecl(quote) => {
if let Some(quote) = quote.map(|q| q.get()) {
if let Some(i) = memchr::memchr(quote, cur) {
cur = &cur[i + 1..];
*self = DTDParseState::InNotationDecl(None);
} else {
break;
}
} else if let Some(i) = memchr::memchr3(b'\'', b'"', b'>', cur) {
match cur[i] {
b @ (b'\'' | b'"') => {
*self = DTDParseState::InNotationDecl(b.try_into().ok())
}
b'>' => *self = DTDParseState::InsideOfInternalSubset,
_ => {}
}
cur = &cur[i + 1..];
} else {
break;
}
}
DTDParseState::UndecidedMarkup(ref mut offset) => {
if *offset > 10 {
if let Some(i) = memchr::memchr(b'>', cur) {
// Hmm... It's not ideal, but judging from the test cases in issue590,
// this crate seems designed not to report errors on incorrect markup
// declarations. So I'll go with ending the unknown markup with `>`.
cur = &cur[i + 1..];
*self = DTDParseState::InsideOfInternalSubset;
continue;
}
// Since it cannot match any markup, it simply skips it until an error occurs.
*offset += cur.len();
break;
}

let mut bytes = [0u8; 9];
let len = *offset - 1;
bytes[..len].copy_from_slice(&buf[buf.len() - len..]);
let end = bytes.len().min(len + cur.len());
bytes[len..end].copy_from_slice(&cur[..end - len]);

match bytes[..end] {
[b'?', ..] => {
// <?
*self = DTDParseState::InPI;
cur = &cur[2 - len..];
}
[b'!', b'-', b'-', ..] => {
// <!--
*self = DTDParseState::InComment;
cur = &cur[3 - len..];
}
[b'!', b'E', b'L', b'E', b'M', b'E', b'N', b'T', ..] => {
// <!ELEMENT
*self = DTDParseState::InElementDecl;
cur = &cur[8 - len..];
}
[b'!', b'E', b'N', b'T', b'I', b'T', b'Y', ..] => {
// <!ENTITY
*self = DTDParseState::InEntityDecl(None);
cur = &cur[7 - len..];
}
[b'!', b'A', b'T', b'T', b'L', b'I', b'S', b'T', ..] => {
// <!ATTLIST
*self = DTDParseState::InAttlistDecl(None);
cur = &cur[8 - len..];
}
[b'!', b'N', b'O', b'T', b'A', b'T', b'I', b'O', b'N', ..] => {
// <!NOTATION
*self = DTDParseState::InNotationDecl(None);
cur = &cur[9 - len..];
}
_ => {
// Hmm... It's not ideal, but judging from the test cases in issue590,
// this crate seems designed not to report errors on incorrect markup
// declarations. So I'll go with ending the unknown markup with `>`.
if let Some(i) = memchr::memchr(b'>', cur) {
*self = DTDParseState::InsideOfInternalSubset;
cur = &cur[i + 1..];
} else {
*offset += cur.len();
break;
}
}
}
}
DTDParseState::Finished => break,
}
}

None
}
}
Loading