about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- readme.md 2
-rw-r--r-- src/lib.rs 188
-rw-r--r-- tests/nyble_pages.rs 22
3 files changed, 169 insertions, 43 deletions
diff --git a/readme.md b/readme.md
index 725a71a..d0fc3b9 100644
--- a/readme.md
+++ b/readme.md
@@ -1,6 +1,8 @@
 opinionated, standards non-compliant HTML parser meant to consume handwritten HTML.
 
 - all tags must close *(even `<br/>`, `<meta/>`, `<link/>`)*
+- tag names must be separated from the tag-body *(where the attributes go)* by a space character *(` `, 0x20)*
+- self-closing tags must have the closing `/` at the very end of the body *(directly before the `>`)*
 - no > in tags except at the end (not even in attributes)
 - inline `<script>` and `<style>` must have their closing-tag be
   first-of-line *(excluding whitespace)*
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index 26ec5dd..f1010ae 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -43,12 +43,14 @@ impl Html {
 			Tag {
 				name: root_tag.name.to_owned(),
 				body: None,
+				self_closing: root_tag.self_closing,
 				children: vec![],
 			}
 		} else {
 			Tag {
 				name: root_tag.name.into(),
 				body: Some(root_tag.body.to_owned()),
+				self_closing: root_tag.self_closing,
 				children: vec![],
 			}
 		};
@@ -65,11 +67,14 @@ impl Html {
 			};
 		}
 
+		println!("Looking for {}", root_tag.name);
+
 		loop {
 			// Special case <script> and <style>
 			if root_tag.name == "script" && tag.get_attribute("src").is_none()
 				|| root_tag.name == "style"
 			{
+				println!("\tparse special");
 				let special = Self::special_parse(rest.unwrap(), root_tag.name);
 
 				match special {
@@ -95,6 +100,7 @@ impl Html {
 			// Find the closing end of out root_tag
 			if let Some((parsed, remaining)) = Self::is_tag(rest.unwrap()) {
 				if parsed.closing && parsed.name == root_tag.name {
+					println!("\tclosed {}", parsed.name);
 					break Consumed {
 						node: Node::Tag(tag),
 						remaining,
@@ -165,43 +171,64 @@ impl Html {
 				};
 				let tag_innards = &raw[1..idx];
 
-				let close = tag_innards.starts_with('/');
-				let self_close_idx = {
-					let possible = &tag_innards[1..];
-					let last_whole_char = possible.rfind(|c: char| !c.is_ascii_whitespace()).map(|n| n + 1);
-					let last_slash = possible.rfind('/').map(|n| n + 1);
-					match (last_slash, last_whole_char) {
-						(Some(slash), Some(whole)) if slash == whole => {
-							Some(slash)
-						},
-						_ => None
+				// Tag is closing. Don't check for self-close
+				// as they cannot be on the same tag
+				if let Some(closing) = tag_innards.strip_prefix('/') {
+					match closing.find(' ') {
+						None => return Some((ParsedTag {
+							closing: true,
+							self_closing: false,
+							name: closing,
+							body: ""
+						}, rest)),
+						Some(idx) => {
+							let name = &closing[..idx];
+							let body = &closing[idx..];
+
+							return Some((ParsedTag{
+								closing: true,
+								self_closing: false,
+								name,
+								body
+							}, rest))
+						}
 					}
-				};
-				let self_close = self_close_idx.is_some();
-				// can't close and self_close
-				if close && self_close { return None; }
-
-				// clean the close from the raw string
-				let name_raw = if let Some(close_idx) = self_close_idx {
-					&tag_innards[..close_idx]
-				} else if close {
-					&tag_innards[1..]
-				} else {
-					tag_innards
-				};
+				}
 
-				let (name, body) = match name_raw.find(|c: char| c.is_ascii_whitespace()){
+				if let Some(closing) = tag_innards.strip_suffix('/') {
+					match closing.find(' ') {
+						None => return Some((ParsedTag {
+							closing: false,
+							self_closing: true,
+							name: closing,
+							body: ""
+						}, rest)),
+						Some(idx) => {
+							let name = &closing[..idx];
+							let body = &closing[idx+1..];
+
+							return Some((ParsedTag{
+								closing: false,
+								self_closing: true,
+								name,
+								body
+							}, rest))
+						}
+					}
+				}
+
+				let (name, body) = match tag_innards.find(' '){
 					None => {
-						(name_raw, "")
+						(tag_innards, "")
 					},
 					Some(idx) => {
-						(&name_raw[..idx], &name_raw[idx+1..])
+						(&tag_innards[..idx], &tag_innards[idx+1..])
 					}
 				};
 
 				Some((ParsedTag{
-					closing: close,
-					self_closing: self_close,
+					closing: false,
+					self_closing: false,
 					name,
 					body
 				}, rest))
@@ -242,6 +269,15 @@ impl Html {
 	}
 }
 
+impl fmt::Display for Html {
+	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+		for node in &self.nodes {
+			write!(f, "{node}")?;
+		}
+		Ok(())
+	}
+}
+
 struct Consumed<'a> {
 	node: Node,
 	remaining: Option<&'a str>,
@@ -263,21 +299,29 @@ pub enum Node {
 	Comment(String),
 }
 
+impl fmt::Display for Node {
+	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+		match self {
+			Self::Text(txt) => write!(f, "{txt}"),
+			Self::Tag(tag) => write!(f, "{tag}"),
+			Self::Comment(cmt) => write!(f, "<!--{cmt}-->"),
+		}
+	}
+}
+
 #[derive(Clone, Debug, PartialEq)]
 pub struct Tag {
 	pub name: String,
 	/// Everything inside the tag that's not it's name. Includes a
 	/// self-close if there is one.
 	pub body: Option<String>,
+	pub self_closing: bool,
 	pub children: Vec<Node>,
 }
 
 impl Tag {
 	pub fn self_closing(&self) -> bool {
-		self.body
-			.as_deref()
-			.map(|s| s.trim_end().ends_with('/'))
-			.unwrap_or(false)
+		self.self_closing
 	}
 
 	pub fn get_attribute<'a>(&'a self, key: &str) -> Option<&'a str> {
@@ -340,12 +384,41 @@ impl Tag {
 	}
 }
 
+impl fmt::Display for Tag {
+	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+		let Tag {
+			name,
+			body,
+			self_closing,
+			children,
+		} = self;
+
+		let formatted_body = if let Some(body) = body {
+			format!(" {body}")
+		} else {
+			String::from("")
+		};
+
+		if *self_closing {
+			// we ignore our children if we're self-closing.
+			write!(f, "<{name} {}/>", body.as_deref().unwrap_or_default())
+		} else {
+			write!(f, "<{name}{formatted_body}>")?;
+			for child in children {
+				write!(f, "{}", child)?;
+			}
+			write!(f, "</{name}>")
+		}
+	}
+}
+
 #[macro_export]
 macro_rules! tag {
 	($name:expr) => {
 		$crate::Node::Tag($crate::Tag {
 			name: String::from($name),
 			body: None,
+			self_closing: false,
 			children: vec![],
 		})
 	};
@@ -354,6 +427,7 @@ macro_rules! tag {
 		$crate::Node::Tag($crate::Tag {
 			name: String::from($name),
 			body: None,
+			self_closing: false,
 			children: vec![$($children),+],
 		})
 	};
@@ -362,6 +436,7 @@ macro_rules! tag {
 		$crate::Node::Tag($crate::Tag {
 			name: String::from($name),
 			body: Some(String::from($body)),
+			self_closing: false,
 			children: vec![],
 		})
 	};
@@ -370,6 +445,7 @@ macro_rules! tag {
 		$crate::Node::Tag($crate::Tag {
 			name: String::from($name),
 			body: Some(String::from($body)),
+			self_closing: false,
 			children: vec![$($children),+],
 		})
 	};
@@ -529,6 +605,39 @@ mod test {
 			)]
 		)
 	}
+
+	fn test_roundtrip(raw: &str) {
+		let html = Html::parse(raw);
+		let string = html.to_string();
+
+		for (raw, html) in raw.lines().zip(string.lines()) {
+			assert_eq!(raw, html)
+		}
+	}
+
+	#[test]
+	fn round_trip_simple() {
+		test_roundtrip("<p>Hello!</p>")
+	}
+
+	#[test]
+	fn round_trip_complex() {
+		test_roundtrip(
+			r#"
+<html>
+	<head>
+		<link rel="style.css"/>
+		<title>Title!</title>
+		<script>
+			alert("hello!");
+		</script>
+	</head>
+	<body>
+		<p>Hello, <i>World!</i></p>
+	</body>
+</html>"#,
+		)
+	}
 }
 
 #[cfg(test)]
@@ -540,6 +649,7 @@ mod tag_test {
 		let tag = Tag {
 			name: "div".into(),
 			body: Some("contenteditable".into()),
+			self_closing: false,
 			children: vec![],
 		};
 		assert!(tag.get_attribute("contenteditable").is_some())
@@ -550,26 +660,18 @@ mod tag_test {
 		let tag = Tag {
 			name: "script".into(),
 			body: Some("src=\"script.js\"".into()),
+			self_closing: false,
 			children: vec![],
 		};
 		assert_eq!(tag.get_attribute("src"), Some("script.js"))
 	}
 
 	#[test]
-	fn tag_finds_attribute_with_self_close() {
-		let tag = Tag {
-			name: "link".into(),
-			body: Some("href=\"style.css\" /".into()),
-			children: vec![],
-		};
-		assert_eq!(tag.get_attribute("href"), Some("style.css"))
-	}
-
-	#[test]
 	fn tag_finds_boolean_in_centre() {
 		let tag = Tag {
 			name: "div".into(),
-			body: Some("id=\"divy\" contenteditable style=\"display: none;\" /".into()),
+			body: Some("id=\"divy\" contenteditable style=\"display: none;\"".into()),
+			self_closing: false,
 			children: vec![],
 		};
 		assert!(tag.get_attribute("contenteditable").is_some());
diff --git a/tests/nyble_pages.rs b/tests/nyble_pages.rs
index 14ca1cc..2c99bc2 100644
--- a/tests/nyble_pages.rs
+++ b/tests/nyble_pages.rs
@@ -6,8 +6,30 @@ fn pares_homepage() {
 	let html = Html::parse(&str);
 }
 
+fn test_roundtrip(raw: &str) {
+	let html = Html::parse(raw);
+	let string = html.to_string();
+	for (ln, (raw, round)) in raw.lines().zip(string.lines()).enumerate() {
+		if raw != round {
+			panic!("line {ln} differs!\n\traw:  {raw}\n\ttrip: {round}")
+		}
+	}
+}
+
+#[test]
+fn homepage_roundtrip() {
+	let str = std::fs::read_to_string("tests/nyble.html").unwrap();
+	test_roundtrip(&str);
+}
+
 #[test]
 fn parses_grass() {
 	let str = std::fs::read_to_string("tests/touching_grass.html").unwrap();
 	let html = Html::parse(&str);
 }
+
+#[test]
+fn grass_roundtrip() {
+	let str = std::fs::read_to_string("tests/touching_grass.html").unwrap();
+	test_roundtrip(&str);
+}