From 0eefb6b385e78b142f691833c856c1cda01f20a0 Mon Sep 17 00:00:00 2001 From: gennyble Date: Wed, 27 Dec 2023 05:34:40 -0600 Subject: parses --- .gitignore | 1 + .rustfmt.toml | 1 + Cargo.lock | 7 ++ Cargo.toml | 9 ++ LICENSE | 15 +++ readme.md | 4 + src/lib.rs | 361 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ tests/homepage.rs | 7 ++ tests/nyble.html | 99 +++++++++++++++ 9 files changed, 504 insertions(+) create mode 100644 .gitignore create mode 100644 .rustfmt.toml create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 LICENSE create mode 100644 readme.md create mode 100644 src/lib.rs create mode 100644 tests/homepage.rs create mode 100644 tests/nyble.html diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/.rustfmt.toml b/.rustfmt.toml new file mode 100644 index 0000000..218e203 --- /dev/null +++ b/.rustfmt.toml @@ -0,0 +1 @@ +hard_tabs = true diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..0006c5b --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "cutie" +version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..f8822b4 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "cutie" +version = "0.1.0" +edition = "2021" +license = "ISC" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8a55383 --- /dev/null +++ b/LICENSE @@ -0,0 +1,15 @@ +ISC License + +Copyright 2023 gennyble + +Permission to use, copy, modify, and/or distribute this software for +any purpose with or without fee is hereby granted, provided that the +above copyright notice and this permission notice appear in all copies. 
+ +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE +FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY +DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER +IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING +OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. \ No newline at end of file diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..89585a8 --- /dev/null +++ b/readme.md @@ -0,0 +1,4 @@ +opinionated, standards non-compliant HTML parser meant to consume handwritten HTML. + +- all tags must close +- no > in tags except at the end (not even in attributes) \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..e0c8c5b --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,361 @@ +pub struct Html { + pub nodes: Vec, +} + +impl Html { + pub fn parse>(raw: S) -> Self { + let mut raw = raw.as_ref(); + + let mut nodes = vec![]; + + loop { + let Consumed { node, remaining } = Self::parse_node(raw); + + nodes.push(node); + + match remaining { + None => break Self { nodes }, + Some(rem) => raw = rem, + } + } + } + + fn parse_node(raw: &str) -> Consumed { + match Self::is_tag(raw) { + Some(_) => match Self::parse_comment(raw) { + None => { + print!("Node "); + Self::parse_tag(raw) + } + Some(cmt) => cmt, + }, + None => { + print!("Text "); + let cons = Self::parse_text(raw); + println!("## {:?}", cons.node); + cons + } + } + } + + fn parse_tag(raw: &str) -> Consumed { + let (root_tag, mut rest) = Self::is_tag(raw).unwrap(); + + //println!("- {raw}"); + if root_tag.closing { + panic!( + "found closing tag when not expected! 
{:?}\n{raw}", + root_tag.name + ) + } else if root_tag.self_closing { + println!("self close return early"); + return Consumed { + node: Node::Tag { + self_closing: true, + name: root_tag.name.into(), + children: vec![], + }, + remaining: rest, + }; + } + + let mut children = vec![]; + + loop { + match Self::is_tag(rest.unwrap()) { + Some(( + ParsedTag { + closing: true, + name, + self_closing: false, + .. + }, + remaining, + )) if name == root_tag.name => { + println!("ret closed - {name}"); + break Consumed { + node: Node::Tag { + self_closing: false, + name: root_tag.name.to_owned(), + children, + }, + remaining, + }; + } + _ => { + println!("recur. ends on {}", root_tag.name,); + let cons = Self::parse_node(rest.unwrap()); + rest = cons.remaining; + children.push(cons.node); + } + } + } + } + + fn parse_comment(raw: &str) -> Option { + if raw.starts_with("") { + None => None, + Some(end) => { + let comment = &after_start[..end]; + let rest = after_start.get(end + 3..); + + Some(Consumed { + node: Node::Comment { + body: comment.into(), + }, + remaining: rest, + }) + } + } + } else { + None + } + } + + /// check if the start of the string is a valid tag + #[rustfmt::skip] + fn is_tag(raw: &str) -> Option<(ParsedTag, Option<&str>)> { + // Starts '<' and any non-whitespace character + let starts_right = raw.starts_with('<') + && raw.chars().nth(1).map(|c| !c.is_ascii_whitespace()).unwrap_or(false); + + if !starts_right { + return None; + } + + match raw.find('>') { + // not a tag if there's no close + None => None, + Some(idx) => { + let rest = match raw.get(idx+1..) 
{ + None => None, + Some("") => None, + Some(txt) => Some(txt) + }; + let tag_innards = &raw[1..idx]; + + let close = tag_innards.starts_with('/'); + let self_close_idx = { + let possible = &tag_innards[1..]; + let last_whole_char = possible.rfind(|c: char| !c.is_ascii_whitespace()).map(|n| n + 1); + let last_slash = possible.rfind('/').map(|n| n + 1); + match (last_slash, last_whole_char) { + (Some(slash), Some(whole)) if slash == whole => { + Some(slash) + }, + _ => None + } + }; + let self_close = self_close_idx.is_some(); + // can't close and self_close + if close && self_close { return None; } + + // clean the close from the raw string + let name_raw = if let Some(close_idx) = self_close_idx { + &tag_innards[..close_idx] + } else if close { + &tag_innards[1..] + } else { + tag_innards + }; + + let (name, body) = match name_raw.find(|c: char| c.is_ascii_whitespace()){ + None => { + (name_raw, "") + }, + Some(idx) => { + (&name_raw[..idx], &name_raw[idx+1..]) + } + }; + + Some((ParsedTag{ + closing: close, + self_closing: self_close, + name, + body + }, rest)) + } + } + } + + fn parse_text(raw: &str) -> Consumed { + let mut end_idx = 0; + let mut search_from = raw; + + loop { + match search_from.find('<') { + // if we ever run out of <'s, the entire string was text + None => { + break Consumed { + node: Node::Text(raw.to_owned()), + remaining: None, + } + } + Some(idx) => { + end_idx += idx; + + if Self::is_tag(&search_from[idx..]).is_some() { + // we've found a new tag, this text node is done + break Consumed { + node: Node::Text(raw[..end_idx].to_owned()), + remaining: Some(&raw[end_idx..]), + }; + } else { + // step over the < + end_idx += 1; + search_from = &raw[end_idx..]; + } + } + } + } + } +} + +struct Consumed<'a> { + node: Node, + remaining: Option<&'a str>, +} + +struct ParsedTag<'a> { + closing: bool, + name: &'a str, + // a tag's body is what exists between the end of the name and the end of + // the tag (including a self-close that may be there and 
any whitespace) + body: &'a str, + self_closing: bool, +} + +impl<'a> ParsedTag<'a> { + /// Whether or not this tag closes or self-closes + pub fn closes(&self) -> bool { + self.closing || self.self_closing + } +} + +#[derive(Debug, PartialEq)] +pub enum Node { + Text(String), + Tag { + // for roundtripping + self_closing: bool, + name: String, + children: Vec, + }, + Comment { + body: String, + }, +} + +#[macro_export] +macro_rules! text { + ($text:expr) => { + Node::Text(String::from($text)) + }; +} + +#[cfg(test)] +mod test { + use crate::{Html, Node}; + + macro_rules! text { + ($text:expr) => { + Node::Text(String::from($text)) + }; + } + + #[test] + fn parse_text_finds_start_of_tag() { + let no_tag = "Hello, World!"; + let starts_tag = "

Hello, World!"; + let ends_tag = "Hello, World!

"; + + let no_tag_res = Html::parse_text(no_tag); + assert_eq!(no_tag_res.node, text!("Hello, World!")); + assert!(no_tag_res.remaining.is_none()); + + let starts_tag_res = Html::parse_text(starts_tag); + assert_eq!(starts_tag_res.node, text!("")); + assert_eq!(starts_tag_res.remaining, Some(starts_tag)); + + let ends_tag_res = Html::parse_text(ends_tag); + assert_eq!(ends_tag_res.node, text!("Hello, World!")); + assert_eq!(ends_tag_res.remaining, Some("

")); + } + + #[test] + fn parse_text_correctly_ignores_nontags() { + let sentence = "The condition 2 < 1 should be 1 > 2"; + let weird = "Hello, < p>"; + let no_close = "Hello

+ + + + + + + nyble.dev + + + + + +

+ +
+ + + myself with my head tilted back, arm outstretched holding a hat. There's a large amount of colorful noise covering my body. + + +
+

Contact~,.

+ +
+ +
+

Hello,

+

+ welcome to this small place. in the great vastness of the internet you've found your way here, thanks. +

+

+ i'm some kind of software person trying to make things. written things, visual things, + interesting things. i enjoy image processing and small, intentional tools. +

+

+ while i work out structuring here better, these are some pages you might like to visit +

+
    +
  1. sillygifs.html – some strange GIF files
  2. +
  3. touching-grass.html – a short video and a long over-analysis +
  4. +
+ +

+ You can subscribe to the Atom feed if you'd like. It gets updated if I + write anything and also if I make something here that I want + to show :) +

+ +
+

Scrap; dropped things collected

+
    +
  • aug 7, 2023 wrench!
  • +
+
+ + + + + + + + +
+
+ + + + + \ No newline at end of file -- cgit 1.4.1-3-g733a5