diff --git a/src/reader/dtd.rs b/src/reader/dtd.rs new file mode 100644 index 00000000..53613f70 --- /dev/null +++ b/src/reader/dtd.rs @@ -0,0 +1,351 @@ +use std::num::NonZeroU8; + +#[derive(Debug, Clone, Copy, PartialEq)] +pub(super) enum DTDParseState { + /// If inside a PubidLiteral or SystemLiteral, it holds the quote type (either `'` or `"`). Otherwise, it holds `None`. + /// + /// ```text + /// [28] doctypedecl ::= '' + /// ``` + BeforeInternalSubset(Option), + InsideOfInternalSubset, + InComment, + InPI, + /// ```text + /// [45] elementdecl ::= '' + /// ``` + InElementDecl, + /// If inside a DefaultDecl's AttValue, it holds the quote type (either `'` or `"`). Otherwise, it holds `None`. + /// + /// ```text + /// [52] AttlistDecl ::= '' + /// [53] AttDef ::= S Name S AttType S DefaultDecl + /// [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue) + /// ``` + InAttlistDecl(Option), + /// If inside a PubidLiteral or SystemLiteral, it holds the quote type (either `'` or `"`). Otherwise, it holds `None`. + /// + /// ```text + /// [70] EntityDecl ::= GEDecl | PEDecl + /// [71] GEDecl ::= '' + /// [72] PEDecl ::= '' + /// [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?) + /// [74] PEDef ::= EntityValue | ExternalID + /// [75] ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral + /// [76] NDataDecl ::= S 'NDATA' S Name + /// ``` + InEntityDecl(Option), + /// If inside a PubidLiteral or SystemLiteral, it holds the quote type (either `'` or `"`). Otherwise, it holds `None`. + /// + /// ```text + /// [82] NotationDecl ::= '' + /// ``` + InNotationDecl(Option), + /// The state entered when the bytes seen during the previous iteration were not enough to determine which markup this is. \ + /// It holds the number of bytes read since the start of the markup. + UndecidedMarkup(usize), + Finished, +} + +impl DTDParseState { + /// Skips DTD contents, driving this state machine over `chunk`. 
+ /// + /// # Parameters (the same as for `reader::BangType::parse`) + /// - `buf`: buffer with data consumed on previous iterations + /// - `chunk`: data read on current iteration and not yet consumed from reader + pub(super) fn feed<'b>(&mut self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { + // This method assumes the DTD is well-formed. + // Since this crate does not support parsing DTDs, the inability to read non-well-formed DTDs + // is not particularly problematic; the only point of interest is reporting well-formed DTDs + // to the user without errors. + + let mut cur = chunk; + while !cur.is_empty() { + match *self { + DTDParseState::BeforeInternalSubset(quote) => { + if let Some(quote) = quote.map(|q| q.get()) { + // ExternalID handling + if let Some(i) = memchr::memchr(quote, cur) { + *self = DTDParseState::BeforeInternalSubset(None); + cur = &cur[i + 1..]; + continue; + } else { + break; + } + } else if let Some(i) = cur + .iter() + .position(|&b| matches!(b, b'\'' | b'"' | b'[' | b'>')) + { + let b = cur[i]; + match b { + b'\'' | b'"' => { + // The opening quote of a system id or public id literal was found. 
+ *self = DTDParseState::BeforeInternalSubset(Some( + b.try_into().unwrap(), + )); + cur = &cur[i + 1..]; + continue; + } + b'[' => { + *self = DTDParseState::InsideOfInternalSubset; + cur = &cur[i + 1..]; + continue; + } + b'>' => { + *self = DTDParseState::Finished; + let len = chunk.len() - cur.len() + i; + // +1 for `>` + return Some((&chunk[..len], len + 1)); + } + _ => {} + } + } else { + break; + } + } + DTDParseState::InsideOfInternalSubset => { + if chunk.len() == cur.len() && buf.ends_with(b"]") && cur.starts_with(b">") { + *self = DTDParseState::Finished; + return Some((&[], 1)); + } + if let Some(i) = memchr::memchr2(b']', b'<', cur) { + if cur[i] == b']' { + if cur.len() > i + 1 && cur[i + 1] == b'>' { + *self = DTDParseState::Finished; + let len = chunk.len() - cur.len() + i + 1; + return Some((&chunk[..len], len + 1)); // +1 for `>` + } else { + cur = &cur[i + 1..]; + } + } else { + match cur[i + 1..] { + [b'?', ..] => { + // { + // ") { + cur = &cur[i + 3..]; + *self = DTDParseState::InsideOfInternalSubset; + } else { + break; + } + } + DTDParseState::InPI => { + if chunk.len() == cur.len() { + if !buf.ends_with(b"") { + *self = DTDParseState::InsideOfInternalSubset; + cur = &cur[1..]; + } + } else if let Some(i) = cur.windows(2).position(|c| c == b"?>") { + *self = DTDParseState::InsideOfInternalSubset; + cur = &cur[i + 2..]; + } else { + break; + } + } + DTDParseState::InElementDecl => { + if let Some(i) = memchr::memchr(b'>', cur) { + cur = &cur[i + 1..]; + *self = DTDParseState::InsideOfInternalSubset; + continue; + } + break; + } + DTDParseState::InAttlistDecl(quote) => { + if let Some(quote) = quote.map(|q| q.get()) { + if let Some(i) = memchr::memchr(quote, cur) { + cur = &cur[i + 1..]; + *self = DTDParseState::InAttlistDecl(None); + } else { + break; + } + } else if let Some(i) = memchr::memchr3(b'\'', b'"', b'>', cur) { + match cur[i] { + b @ (b'\'' | b'"') => { + *self = DTDParseState::InAttlistDecl(b.try_into().ok()) + } + b'>' => *self = 
DTDParseState::InsideOfInternalSubset, + _ => {} + } + cur = &cur[i + 1..]; + } else { + break; + } + } + DTDParseState::InEntityDecl(quote) => { + if let Some(quote) = quote.map(|q| q.get()) { + if let Some(i) = memchr::memchr(quote, cur) { + cur = &cur[i + 1..]; + *self = DTDParseState::InEntityDecl(None); + } else { + break; + } + } else if let Some(i) = memchr::memchr3(b'\'', b'"', b'>', cur) { + match cur[i] { + b @ (b'\'' | b'"') => { + *self = DTDParseState::InEntityDecl(b.try_into().ok()) + } + b'>' => *self = DTDParseState::InsideOfInternalSubset, + _ => {} + } + cur = &cur[i + 1..]; + } else { + break; + } + } + DTDParseState::InNotationDecl(quote) => { + if let Some(quote) = quote.map(|q| q.get()) { + if let Some(i) = memchr::memchr(quote, cur) { + cur = &cur[i + 1..]; + *self = DTDParseState::InNotationDecl(None); + } else { + break; + } + } else if let Some(i) = memchr::memchr3(b'\'', b'"', b'>', cur) { + match cur[i] { + b @ (b'\'' | b'"') => { + *self = DTDParseState::InNotationDecl(b.try_into().ok()) + } + b'>' => *self = DTDParseState::InsideOfInternalSubset, + _ => {} + } + cur = &cur[i + 1..]; + } else { + break; + } + } + DTDParseState::UndecidedMarkup(ref mut offset) => { + if *offset > 10 { + if let Some(i) = memchr::memchr(b'>', cur) { + // Hmm... It's not ideal, but judging from the test cases in issue590, + // this crate seems designed not to report errors on incorrect markup + // declarations. So I'll go with ending the unknown markup with `>`. + cur = &cur[i + 1..]; + *self = DTDParseState::InsideOfInternalSubset; + continue; + } + // Since it cannot match any markup, it simply skips it until an error occurs. + *offset += cur.len(); + break; + } + + let mut bytes = [0u8; 9]; + let len = *offset - 1; + bytes[..len].copy_from_slice(&buf[buf.len() - len..]); + let end = bytes.len().min(len + cur.len()); + bytes[len..end].copy_from_slice(&cur[..end - len]); + + match bytes[..end] { + [b'?', ..] => { + // { + // Comment, /// . 
Contains balance of '<' (+1) and '>' (-1) - DocType(i32), + DocType(DTDParseState), } impl BangType { #[inline(always)] @@ -1165,7 +1167,7 @@ impl BangType { Ok(match byte { Some(b'[') => Self::CData, Some(b'-') => Self::Comment, - Some(b'D') | Some(b'd') => Self::DocType(0), + Some(b'D') | Some(b'd') => Self::DocType(DTDParseState::BeforeInternalSubset(None)), _ => return Err(SyntaxError::InvalidBangMarkup), }) } @@ -1222,18 +1224,7 @@ impl BangType { } } } - Self::DocType(ref mut balance) => { - for i in memchr::memchr2_iter(b'<', b'>', chunk) { - if chunk[i] == b'<' { - *balance += 1; - } else { - if *balance == 0 { - return Some((&chunk[..i], i + 1)); // +1 for `>` - } - *balance -= 1; - } - } - } + Self::DocType(ref mut state) => return state.feed(buf, chunk), } None } @@ -1242,7 +1233,7 @@ impl BangType { match self { Self::CData => SyntaxError::UnclosedCData, Self::Comment => SyntaxError::UnclosedComment, - Self::DocType(_) => SyntaxError::UnclosedDoctype, + Self::DocType { .. } => SyntaxError::UnclosedDoctype, } } } @@ -1266,7 +1257,7 @@ mod test { mod read_bang_element { use super::*; use crate::errors::{Error, SyntaxError}; - use crate::reader::BangType; + use crate::reader::{BangType, DTDParseState}; use crate::utils::Bytes; /// Checks that reading CDATA content works correctly @@ -1552,7 +1543,7 @@ mod test { .unwrap(); assert_eq!( (ty, Bytes(bytes)), - (BangType::DocType(0), Bytes(b"!DOCTYPE")) + (BangType::DocType(DTDParseState::Finished), Bytes(b"!DOCTYPE")) ); assert_eq!(position, 10); } @@ -1626,7 +1617,7 @@ mod test { .unwrap(); assert_eq!( (ty, Bytes(bytes)), - (BangType::DocType(0), Bytes(b"!doctype")) + (BangType::DocType(DTDParseState::Finished), Bytes(b"!doctype")) ); assert_eq!(position, 10); } diff --git a/src/reader/state.rs b/src/reader/state.rs index 656e57cc..d986b336 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -6,7 +6,7 @@ use crate::errors::{Error, IllFormedError, Result, SyntaxError}; use crate::events::{BytesCData, 
BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event}; #[cfg(feature = "encoding")] use crate::reader::EncodingRef; -use crate::reader::{BangType, Config, ParseState}; +use crate::reader::{BangType, Config, DTDParseState, ParseState}; use crate::utils::{is_whitespace, name_len}; /// A struct that holds a current reader state and a parser configuration. @@ -144,7 +144,7 @@ impl ReaderState { // https://www.w3.org/TR/xml11/#sec-prolog-dtd // HTML5 allows mixed case for doctype declarations: // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state - BangType::DocType(0) if uncased_starts_with(buf, b"!DOCTYPE") => { + BangType::DocType(DTDParseState::Finished) if uncased_starts_with(buf, b"!DOCTYPE") => { match buf[8..].iter().position(|&b| !is_whitespace(b)) { Some(start) => Ok(Event::DocType(BytesText::wrap( // Cut of `!DOCTYPE` and any number of spaces from start diff --git a/tests/issues.rs b/tests/issues.rs index 916ba7ff..6d31cf13 100644 --- a/tests/issues.rs +++ b/tests/issues.rs @@ -512,3 +512,63 @@ fn issue801() { } } } + +/// Regression test for https://github.com/tafia/quick-xml/issues/923 +mod issue923 { + use super::*; + + fn run_test(xml: &str) { + let mut reader = Reader::from_str(xml); + let mut buf = vec![]; + loop { + match reader.read_event_into(&mut buf) { + Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), + Ok(Event::Eof) => break, + Ok(_) => {} + } + } + } + + #[test] + fn close_angled_bracket_in_entity_declaratin() { + const XML: &str = r#"">]>"#; + run_test(XML); + } + + #[test] + fn open_angled_bracket_in_entity_declaration() { + const XML: &str = r#"]>"#; + run_test(XML); + } + + #[test] + fn close_angled_bracket_in_attlist_declaration() { + const XML: &str = r#" 2 is true" att2 #FIXED '>>> in other quote'>]>"#; + run_test(XML); + } + + #[test] + fn close_angled_bracket_in_notation_declaration() { + const XML: &str = r#">>some_system_id"'>]>"#; + run_test(XML); + } + + #[test] + fn 
open_and_close_angled_bracket_in_comment() { + const XML: &str = + r#"]>"#; + run_test(XML); + } + + #[test] + fn open_and_close_angled_bracket_in_pi_data() { + const XML: &str = r#"><<>><><>?>]>"#; + run_test(XML); + } + + #[test] + fn issue258() { + const XML: &str = r#"'>'>?>-->]>"#; + run_test(XML); + } +}