From 262b67cec4ebf0c90261c771f968e6e3231bc565 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Thu, 7 Mar 2024 17:31:41 +0100 Subject: [PATCH 01/11] fix(all): rename folders --- Cargo.toml | 4 +- README.md | 4 +- exclusion/build/group.rs | 178 ------------- exclusion/build/mod.rs | 146 ----------- exclusion/build/split.rs | 15 -- inclusion/build/auto.rs | 101 -------- inclusion/build/entry.rs | 268 -------------------- inclusion/build/index.rs | 180 ------------- inclusion/build/inner.rs | 84 ------ inclusion/build/mod.rs | 44 ---- inclusion/build/plain.rs | 224 ---------------- {exclusion => robotxt}/Cargo.toml | 0 {exclusion => robotxt}/README.md | 0 {exclusion => robotxt}/lib.rs | 0 {exclusion => robotxt}/parse/access.rs | 0 {exclusion => robotxt}/parse/inner.rs | 0 {exclusion => robotxt}/parse/lexer.rs | 0 {exclusion => robotxt}/parse/mod.rs | 0 {exclusion => robotxt}/parse/parser.rs | 0 {exclusion => robotxt}/parse/rule.rs | 0 {exclusion => robotxt}/parse/serde.rs | 0 {exclusion => robotxt}/paths/create.rs | 0 {exclusion => robotxt}/paths/mod.rs | 0 {exclusion => robotxt}/paths/normal.rs | 0 {inclusion => sitemapo}/Cargo.toml | 0 {inclusion => sitemapo}/README.md | 0 {inclusion => sitemapo}/lib.rs | 0 {inclusion => sitemapo}/parse/auto.rs | 0 {inclusion => sitemapo}/parse/entry.rs | 0 {inclusion => sitemapo}/parse/index.rs | 0 {inclusion => sitemapo}/parse/inner.rs | 0 {inclusion => sitemapo}/parse/mod.rs | 0 {inclusion => sitemapo}/parse/plain.rs | 0 {inclusion => sitemapo}/record/entry.rs | 0 {inclusion => sitemapo}/record/frequency.rs | 0 {inclusion => sitemapo}/record/index.rs | 0 {inclusion => sitemapo}/record/mod.rs | 0 {inclusion => sitemapo}/record/priority.rs | 0 38 files changed, 4 insertions(+), 1244 deletions(-) delete mode 100644 exclusion/build/group.rs delete mode 100644 exclusion/build/mod.rs delete mode 100644 exclusion/build/split.rs delete mode 100644 inclusion/build/auto.rs delete mode 100644 inclusion/build/entry.rs delete mode 100644 
inclusion/build/index.rs delete mode 100644 inclusion/build/inner.rs delete mode 100644 inclusion/build/mod.rs delete mode 100644 inclusion/build/plain.rs rename {exclusion => robotxt}/Cargo.toml (100%) rename {exclusion => robotxt}/README.md (100%) rename {exclusion => robotxt}/lib.rs (100%) rename {exclusion => robotxt}/parse/access.rs (100%) rename {exclusion => robotxt}/parse/inner.rs (100%) rename {exclusion => robotxt}/parse/lexer.rs (100%) rename {exclusion => robotxt}/parse/mod.rs (100%) rename {exclusion => robotxt}/parse/parser.rs (100%) rename {exclusion => robotxt}/parse/rule.rs (100%) rename {exclusion => robotxt}/parse/serde.rs (100%) rename {exclusion => robotxt}/paths/create.rs (100%) rename {exclusion => robotxt}/paths/mod.rs (100%) rename {exclusion => robotxt}/paths/normal.rs (100%) rename {inclusion => sitemapo}/Cargo.toml (100%) rename {inclusion => sitemapo}/README.md (100%) rename {inclusion => sitemapo}/lib.rs (100%) rename {inclusion => sitemapo}/parse/auto.rs (100%) rename {inclusion => sitemapo}/parse/entry.rs (100%) rename {inclusion => sitemapo}/parse/index.rs (100%) rename {inclusion => sitemapo}/parse/inner.rs (100%) rename {inclusion => sitemapo}/parse/mod.rs (100%) rename {inclusion => sitemapo}/parse/plain.rs (100%) rename {inclusion => sitemapo}/record/entry.rs (100%) rename {inclusion => sitemapo}/record/frequency.rs (100%) rename {inclusion => sitemapo}/record/index.rs (100%) rename {inclusion => sitemapo}/record/mod.rs (100%) rename {inclusion => sitemapo}/record/priority.rs (100%) diff --git a/Cargo.toml b/Cargo.toml index 388742f..8dc43e9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,8 +4,8 @@ resolver = "2" members = [ "./countio", - "./exclusion", - "./inclusion", + "./robotxt", + "./sitemapo", ] [workspace.package] diff --git a/README.md b/README.md index 312c66b..41e0334 100644 --- a/README.md +++ b/README.md @@ -13,9 +13,9 @@ - [countio]('./countio/): The wrapper struct to enable byte counting for std::io::Read and 
std::io::Write and its asynchronous variants from futures and tokio. -- [robotxt]('./exclusion/): The implementation of the Robots.txt (or URL +- [robotxt]('./robotxt/): The implementation of the Robots.txt (or URL exclusion) protocol with the support of crawl-delay, sitemap and universal match extensions. -- [sitemapo]('./inclusion/): The implementation of the Sitemap (or URL +- [sitemapo]('./sitemapo/): The implementation of the Sitemap (or URL inclusion) protocol with the support of txt, xml formats and video, image, and news extensions. diff --git a/exclusion/build/group.rs b/exclusion/build/group.rs deleted file mode 100644 index 57f1e2d..0000000 --- a/exclusion/build/group.rs +++ /dev/null @@ -1,178 +0,0 @@ -use std::collections::HashSet; -use std::fmt::{Display, Formatter}; - -use crate::build::format_comment; -use crate::paths::normalize_path; - -/// The single formatted `user-agent` group. -/// -/// See [crate::RobotsBuilder::group]. -#[derive(Debug, Default, Clone)] -pub struct GroupBuilder { - user_agents: HashSet, - rules_disallow: Vec, - rules_allow: Vec, - delay: Option, - - header: Option, - footer: Option, -} - -impl GroupBuilder { - /// Creates a new builder with default settings. - pub fn new() -> Self { - Self::default() - } - - /// Adds a local header, usually used for rule notes. - /// - /// ``` - /// use robotxt::RobotsBuilder; - /// - /// let txt = RobotsBuilder::default() - /// .group(["*"], |u| u.allow("/")) - /// .group(["foobot"], |u| { - /// u.header("Note: Bad Bot!") - /// .disallow("/") - /// .allow("/bad/bot.txt") - /// }); - /// ``` - pub fn header(mut self, header: &str) -> Self { - self.header = Some(header.to_string()); - self - } - - /// Adds an `Allow` directive. 
- /// - /// ``` - /// use robotxt::RobotsBuilder; - /// - /// let txt = RobotsBuilder::default() - /// .group(["foobot"], |u| { - /// u.allow("/").disallow("/secret.txt") - /// }); - /// ``` - pub fn allow(mut self, rule: &str) -> Self { - let rule = normalize_path(rule); - self.rules_allow.push(rule); - self - } - - /// Adds a `Disallow` directive. - /// - /// ``` - /// use robotxt::RobotsBuilder; - /// - /// let txt = RobotsBuilder::default() - /// .group(["foobot"], |u| { - /// u.allow("/").disallow("/secret.txt") - /// }); - /// ``` - pub fn disallow(mut self, rule: &str) -> Self { - let rule = normalize_path(rule); - self.rules_disallow.push(rule); - self - } - - /// Adds a `Crawl-Delay` directive. - /// - /// ``` - /// use robotxt::RobotsBuilder; - /// - /// let txt = RobotsBuilder::default() - /// .group(["foobot"], |u| { - /// u.crawl_delay(5) - /// }); - /// ``` - pub fn crawl_delay(mut self, delay: u16) -> Self { - self.delay = Some(delay); - self - } - - /// Adds a local footer, usually used for rule notes. 
- /// - /// ``` - /// use robotxt::RobotsBuilder; - /// - /// let txt = RobotsBuilder::default() - /// .group(["foobot"], |u| { - /// u.footer("Note: Bad Bot!") - /// .disallow("/") - /// .allow("/bad/bot.txt") - /// }); - /// ``` - pub fn footer(mut self, footer: &str) -> Self { - self.footer = Some(footer.to_string()); - self - } -} - -impl<'ua> FromIterator<&'ua str> for GroupBuilder { - fn from_iter>(iter: T) -> Self { - let uas = iter.into_iter().map(|ua| ua.trim().to_string()); - Self { - user_agents: uas.collect(), - ..Self::default() - } - } -} - -impl Display for GroupBuilder { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let header = self.header.as_ref().map(|h| format_comment(h)); - let footer = self.footer.as_ref().map(|f| format_comment(f)); - let delay = self.delay.map(|d| format!("Crawl-Delay: {d}")); - - let agents = if self.user_agents.is_empty() { - Some("User-Agent: *".to_string()) - } else { - let uas = self.user_agents.iter(); - let uas = uas.map(|ua| format!("User-Agent: {ua}")); - Some(uas.collect::>().join("\n")) - }; - - let disallows = if self.rules_disallow.is_empty() { - None - } else { - let rd = self.rules_disallow.iter(); - let rd = rd.map(|r| format!("Disallow: {r}")); - Some(rd.collect::>().join("\n")) - }; - - let allows = if self.rules_allow.is_empty() { - // Explicit Allow: * if no Disallows. - // Used to interrupt the user-group i.e. - // user-agent: a ..no rules.. 
user-agent: b - match self.rules_disallow.is_empty() { - true => Some("Allow: *".to_string()), - false => None, - } - } else { - let rd = self.rules_allow.iter(); - let rd = rd.map(|r| format!("Allow: {r}")); - Some(rd.collect::>().join("\n")) - }; - - let result = [header, agents, delay, disallows, allows, footer]; - let result = result.iter().filter_map(|u| u.clone()); - let result = result.collect::>().join("\n"); - write!(f, "{}", result.as_str()) - } -} - -#[cfg(test)] -mod builder { - use super::*; - - #[test] - fn empty_uas() { - let r = GroupBuilder::new().disallow("/foo").to_string(); - assert!(r.contains("User-Agent: *")); - } - - #[test] - fn no_rules() { - let r = GroupBuilder::from_iter(["foobot"]).to_string(); - assert!(r.contains("Allow: *")); - } -} diff --git a/exclusion/build/mod.rs b/exclusion/build/mod.rs deleted file mode 100644 index e2a3046..0000000 --- a/exclusion/build/mod.rs +++ /dev/null @@ -1,146 +0,0 @@ -use std::collections::HashSet; -use std::fmt; - -use url::Url; - -pub use crate::build::group::GroupBuilder; -use crate::build::split::format_comment; - -mod group; -mod split; - -/// The set of formatted `user-agent` groups that can be written -/// in the `robots.txt` compliant format. -#[derive(Debug, Default, Clone)] -pub struct RobotsBuilder { - groups: Vec, - sitemaps: HashSet, - header: Option, - footer: Option, -} - -impl RobotsBuilder { - /// Creates a new [`RobotsBuilder`] with default settings. - pub fn new() -> Self { - Self::default() - } - - /// Adds a global header, usually used for permissions or legal notices. - /// - /// ``` - /// use robotxt::RobotsBuilder; - /// - /// let txt = RobotsBuilder::default() - /// .header("Note: Stop right there!") - /// .group(["*"], |u| u.disallow("/")) - /// .group(["foobot"], |u| u.allow("/")); - /// ``` - pub fn header(mut self, header: &str) -> Self { - self.header = Some(header.to_string()); - self - } - - /// Adds a new `user-agent` group from the provided list of user-agents. 
- /// - /// ``` - /// use robotxt::RobotsBuilder; - /// - /// let txt = RobotsBuilder::default() - /// .group(["*"], |u| u.disallow("/")) - /// .group(["foobot"], |u| u.allow("/")); - /// ``` - pub fn group<'a>( - mut self, - group: impl IntoIterator, - factory: impl FnOnce(GroupBuilder) -> GroupBuilder, - ) -> Self { - let section = GroupBuilder::from_iter(group); - self.groups.push(factory(section)); - self - } - - /// Adds the `Sitemap` directive from the URL address. - /// - /// ``` - /// use url::Url; - /// use robotxt::RobotsBuilder; - /// - /// let txt = RobotsBuilder::default() - /// .sitemap("https://example.com/sitemap_1.xml".try_into().unwrap()) - /// .sitemap("https://example.com/sitemap_1.xml".try_into().unwrap()); - /// ``` - pub fn sitemap(mut self, sitemap: Url) -> Self { - self.sitemaps.insert(sitemap); - self - } - - /// Adds a global footer, usually used for notices. - /// - /// ``` - /// use robotxt::RobotsBuilder; - /// - /// let txt = RobotsBuilder::default() - /// .group(["*"], |u| u.disallow("/")) - /// .group(["foobot"], |u| u.allow("/")) - /// .footer("Note: Have a nice day!"); - /// ``` - pub fn footer(mut self, footer: &str) -> Self { - self.footer = Some(footer.to_string()); - self - } - - /// Parses the constructed output. - /// See [`Robots::from_bytes`]. 
- /// - /// [`Robots::from_bytes`]: crate::Robots::from_bytes - #[cfg(feature = "parser")] - #[cfg_attr(docsrs, doc(cfg(feature = "parser")))] - pub fn parse(&self, user_agent: &str) -> crate::Robots { - let txt = self.to_string(); - crate::Robots::from_bytes(txt.as_bytes(), user_agent) - } -} - -impl fmt::Display for RobotsBuilder { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let header = self.header.as_ref().map(|h| format_comment(h)); - let footer = self.footer.as_ref().map(|f| format_comment(f)); - - let groups = self.groups.iter().map(|u| u.to_string()); - let groups = groups.collect::>().join("\n\n"); - - let result = [header, Some(groups), footer]; - let result = result.iter().filter_map(|u| u.clone()); - let result = result.collect::>().join("\n\n"); - write!(f, "{}", result.as_str()) - } -} - -#[cfg(test)] -mod builder { - use crate::{Result, RobotsBuilder}; - - #[test] - fn readme() -> Result<()> { - let txt = RobotsBuilder::default() - .header("Robots.txt: Start") - .group(["foobot"], |u| { - u.crawl_delay(5) - .header("Rules for Foobot: Start") - .allow("/example/yeah.txt") - .disallow("/example/nope.txt") - .footer("Rules for Foobot: End") - }) - .group(["barbot", "nombot"], |u| { - u.crawl_delay(2) - .disallow("/example/yeah.txt") - .disallow("/example/nope.txt") - }) - .sitemap("https://example.com/sitemap_1.xml".try_into()?) - .sitemap("https://example.com/sitemap_2.xml".try_into()?) - .footer("Robots.txt: End"); - - println!("{}", txt.to_string()); - Ok(()) - } -} diff --git a/exclusion/build/split.rs b/exclusion/build/split.rs deleted file mode 100644 index 0bc57ee..0000000 --- a/exclusion/build/split.rs +++ /dev/null @@ -1,15 +0,0 @@ -/// Splits multiline comments into lines and prefixes them with `#`. 
-pub fn format_comment(txt: &str) -> String { - txt.lines() - .map(|txt| txt.trim()) - .filter(|txt| !txt.is_empty()) - .map(|txt| { - if txt.starts_with('#') { - txt.to_owned() - } else { - format!("# {txt}") - } - }) - .collect::>() - .join("\n") -} diff --git a/inclusion/build/auto.rs b/inclusion/build/auto.rs deleted file mode 100644 index c85dce1..0000000 --- a/inclusion/build/auto.rs +++ /dev/null @@ -1,101 +0,0 @@ -use url::Url; - -use crate::build::{EntryBuilder, IndexBuilder}; -use crate::record::Entry; -use crate::Error; - -/// TODO: Desc. -/// -/// Automatic sitemap file constructor. -/// NOTE: Does not deduplicate records. -/// -/// ```rust -/// #[derive(Debug, thiserror::Error)] -/// enum CustomError { -/// // .. -/// #[error("sitemap error: {0}")] -/// Sitemap(#[from] sitemapo::Error), -/// //.. -/// } -/// -/// fn main() -> Result<(), CustomError> { -/// Ok(()) -/// } -/// ``` -pub struct AutoBuilder { - index: Option>, - entry: Vec>, - queue: Vec, - // factory: impl Fn() -> W, -} - -impl AutoBuilder { - /// TODO: Desc. - pub fn new() -> Self { - todo!() - } -} - -impl AutoBuilder -where - W: std::io::Write, -{ - /// TODO: Desc. - pub fn try_sync(&mut self, fetcher: A) -> Result<(), E> - where - E: std::error::Error + From, - A: Fn(Url) -> Result, E>, - { - // if let Some(builder) = self.entry.as_mut() { - // builder.write(record) - // } - - todo!() - } -} - -#[cfg(feature = "tokio")] -#[cfg_attr(docsrs, doc(cfg(feature = "tokio")))] -impl AutoBuilder -where - W: tokio::io::AsyncWrite + Unpin + Send, -{ - /// TODO: Desc. - pub async fn try_async(&mut self) -> Result<(), Error> { - todo!() - } -} - -impl std::fmt::Debug for AutoBuilder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // TODO: Debug. 
- f.debug_struct("AutoBuilder").finish() - } -} - -// impl Default for AutoBuilder { -// fn default() -> Self { -// Self { -// entry: None, -// index: None, -// } -// } -// } - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn sync() -> Result<(), Error> { - // TODO: Test. - Ok(()) - } - - #[cfg(feature = "tokio")] - #[tokio::test] - async fn asynk() -> Result<(), Error> { - // TODO: Test. - Ok(()) - } -} diff --git a/inclusion/build/entry.rs b/inclusion/build/entry.rs deleted file mode 100644 index 46508a5..0000000 --- a/inclusion/build/entry.rs +++ /dev/null @@ -1,268 +0,0 @@ -use std::io::Write; - -use quick_xml::{events, Writer}; -use time::format_description::well_known::Iso8601; - -use crate::build::{Builder, InnerBuilder, CONFIG}; -use crate::record::*; -use crate::{Error, Result}; - -/// Sitemap builder for the versatile XML file with an optional support of extensions. -/// -/// For example: -/// -/// ```xml -/// -/// -/// -/// https://www.example.com/foo.html -/// 2022-06-04 -/// -/// -/// ``` -/// -/// Enforces total written/read bytes and total records limits. -/// See [Error]. -/// -/// ```rust -/// use sitemapo::build::{Builder, EntryBuilder}; -/// use sitemapo::record::Entry; -/// -/// fn main() -> sitemapo::Result<()> { -/// let buf = Vec::new(); -/// let url = "https://example.com/".try_into().unwrap(); -/// let rec = Entry::new(url); -/// -/// let mut builder = EntryBuilder::new(buf)?; -/// builder.write(&rec)?; -/// let _buf = builder.close()?; -/// Ok(()) -/// } -/// ``` -pub struct EntryBuilder { - inner: InnerBuilder, -} - -impl EntryBuilder { - /// Creates a new instance with the given writer. - pub(crate) fn from_writer(writer: W) -> Self { - let inner = InnerBuilder::from_writer(writer); - Self::from_inner(inner) - } - - /// Creates a new instance with the given inner parser. - pub(crate) fn from_inner(inner: InnerBuilder) -> Self { - Self { inner } - } - - /// Returns a reference to the underlying writer. 
- pub fn get_ref(&self) -> &W { - self.inner.get_ref() - } - - /// Returns a mutable reference to the underlying writer. - pub fn get_mut(&mut self) -> &mut W { - self.inner.get_mut() - } - - /// Returns an underlying writer. - pub fn into_inner(self) -> W { - self.inner.into_inner() - } - - pub(crate) fn create_entry_open(&mut self) -> Result> { - self.inner.create_open_tag(URL_SET) - } - - pub(crate) fn create_entry_record(&mut self, record: &Entry) -> Result> { - if self.inner.records + 1 > RECORD_LIMIT { - return Err(Error::EntryLimit { over: 1 }); - } - - let format = &Iso8601::<{ CONFIG }>; - let location = record.location.to_string(); - let modified = record.modified.map(|u| u.format(format).unwrap()); - let priority = record.priority.map(|u| u.to_string()); - let frequency = record.frequency.map(|u| u.to_string()); - - let mut temp = Writer::new(Vec::new()); - let element = temp.create_element(URL); - element.write_inner_content(|writer| -> quick_xml::Result<()> { - let tag = writer.create_element(LOCATION); - tag.write_text_content(events::BytesText::new(&location))?; - - if let Some(modified) = modified { - let tag = writer.create_element(LAST_MODIFIED); - tag.write_text_content(events::BytesText::new(&modified))?; - } - - if let Some(priority) = priority { - let tag = writer.create_element(PRIORITY); - tag.write_text_content(events::BytesText::new(&priority))?; - } - - if let Some(frequency) = frequency { - let tag = writer.create_element(CHANGE_FREQUENCY); - tag.write_text_content(events::BytesText::new(&frequency))?; - } - - Ok(()) - })?; - - let buf = temp.into_inner(); - if buf.len() > BYTE_LIMIT { - let over_limit = buf.len() - BYTE_LIMIT; - return Err(Error::ByteLimit { over: over_limit }); - } - - Ok(buf) - } - - pub(crate) fn create_entry_close(&mut self) -> Result> { - self.inner.create_close_tag(URL_SET) - } -} - -impl std::fmt::Debug for EntryBuilder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - 
f.debug_struct("EntryBuilder") - .field("inner", &self.inner) - .finish() - } -} - -impl Builder for EntryBuilder { - type Error = Error; - - fn new(writer: W) -> Result { - let mut this = Self::from_writer(writer); - let temp = this.create_entry_open()?; - this.inner.writer.write_all(&temp)?; - Ok(this) - } - - fn write(&mut self, record: &Entry) -> Result<()> { - let temp = self.create_entry_record(record)?; - self.inner.writer.write_all(&temp)?; - self.inner.records += 1; - Ok(()) - } - - fn close(mut self) -> Result { - let temp = self.create_entry_close()?; - self.inner.writer.write_all(&temp)?; - Ok(self.into_inner()) - } -} - -#[cfg(feature = "tokio")] -#[cfg_attr(docsrs, doc(cfg(feature = "tokio")))] -mod tokio { - use async_trait::async_trait; - use tokio::io::{AsyncWrite, AsyncWriteExt}; - - use crate::build::{AsyncBuilder, EntryBuilder}; - use crate::record::Entry; - use crate::{Error, Result}; - - #[async_trait] - impl AsyncBuilder for EntryBuilder { - type Error = Error; - - async fn new(writer: W) -> Result { - let mut this = Self::from_writer(writer); - let temp = this.create_entry_open()?; - this.inner.writer.write_all(&temp).await?; - Ok(this) - } - - async fn write(&mut self, record: &Entry) -> Result<()> { - let temp = self.create_entry_record(record)?; - self.inner.writer.write_all(&temp).await?; - self.inner.records += 1; - Ok(()) - } - - async fn close(mut self) -> Result { - let temp = self.create_entry_close()?; - self.inner.writer.write_all(&temp).await?; - Ok(self.into_inner()) - } - } -} - -#[cfg(test)] -mod test { - use std::io::BufWriter; - - use url::Url; - - use crate::build::{Builder, EntryBuilder}; - use crate::record::Entry; - use crate::Result; - - #[test] - fn synk() -> Result<()> { - let buf = Vec::new(); - let mut builder = EntryBuilder::new(buf)?; - - let url = Url::parse("https://example.com/").unwrap(); - let rec = Entry::new(url); - builder.write(&rec)?; - let _buf = builder.close()?; - - Ok(()) - } - - #[test] - fn 
synk_with_buf() -> Result<()> { - let buf = BufWriter::new(Vec::new()); - let mut builder = EntryBuilder::new(buf)?; - - let url = Url::parse("https://example.com/").unwrap(); - let rec = Entry::new(url); - builder.write(&rec)?; - let _buf = builder.close()?; - - Ok(()) - } -} - -#[cfg(feature = "tokio")] -#[cfg(test)] -mod tokio_test { - use tokio::io::{AsyncWriteExt, BufWriter}; - use url::Url; - - use crate::build::{AsyncBuilder, EntryBuilder}; - use crate::{record::Entry, Result}; - - #[tokio::test] - async fn asynk() -> Result<()> { - let buf = Vec::new(); - let mut builder = EntryBuilder::new(buf).await?; - - let url = Url::parse("https://example.com/").unwrap(); - let rec = Entry::new(url); - builder.write(&rec).await?; - let _buf = builder.close().await?; - - Ok(()) - } - - #[tokio::test] - async fn asynk_with_buf() -> Result<()> { - let buf = BufWriter::new(Vec::new()); - let mut builder = EntryBuilder::new(buf).await?; - - let url = Url::parse("https://example.com/").unwrap(); - - let rec = Entry::new(url); - builder.write(&rec).await?; - let mut buf = builder.close().await?; - - let _ = buf.flush().await?; - - Ok(()) - } -} diff --git a/inclusion/build/index.rs b/inclusion/build/index.rs deleted file mode 100644 index 10222b1..0000000 --- a/inclusion/build/index.rs +++ /dev/null @@ -1,180 +0,0 @@ -use std::io::Write; - -use quick_xml::{events, Writer}; -use time::format_description::well_known::Iso8601; - -use crate::build::{Builder, InnerBuilder, CONFIG}; -use crate::record::*; -use crate::{Error, Result}; - -/// Sitemap index parser for the versatile XML file. -/// -/// For example: -/// -/// ```xml -/// -/// -/// -/// http://www.example.com/sitemap.xml.gz -/// 2004-10-01T18:23:17+00:00 -/// -/// -/// ``` -/// -/// Enforces total written/read bytes and total records limits. -/// See [Error]. 
-/// -/// ```rust -/// use sitemapo::build::{Builder, IndexBuilder}; -/// use sitemapo::record::Index; -/// -/// fn main() -> sitemapo::Result<()> { -/// let buf = Vec::new(); -/// let url = "https://example.com/".try_into().unwrap(); -/// let rec = Index::new(url); -/// -/// let mut builder = IndexBuilder::new(buf)?; -/// builder.write(&rec)?; -/// let _buf = builder.close()?; -/// Ok(()) -/// } -/// ``` -pub struct IndexBuilder { - inner: InnerBuilder, -} - -impl IndexBuilder { - /// Creates a new instance with the given writer. - pub(crate) fn from_writer(writer: W) -> Self { - let inner = InnerBuilder::from_writer(writer); - Self::from_inner(inner) - } - - /// Creates a new instance with the given inner parser. - pub(crate) fn from_inner(inner: InnerBuilder) -> Self { - Self { inner } - } - - /// Returns a reference to the underlying writer. - pub fn get_ref(&self) -> &W { - self.inner.get_ref() - } - - /// Returns a mutable reference to the underlying writer. - pub fn get_mut(&mut self) -> &mut W { - self.inner.get_mut() - } - - /// Returns an underlying writer. 
- pub fn into_inner(self) -> W { - self.inner.into_inner() - } - - pub(crate) fn create_index_open(&mut self) -> Result> { - self.inner.create_open_tag(SITEMAP_INDEX) - } - - pub(crate) fn create_index_record(&mut self, record: &Index) -> Result> { - if self.inner.records + 1 > RECORD_LIMIT { - return Err(Error::EntryLimit { over: 1 }); - } - - let format = &Iso8601::<{ CONFIG }>; - let location = record.location.to_string(); - let modified = record.modified.map(|u| u.format(format).unwrap()); - - let mut temp = Writer::new(Vec::new()); - let element = temp.create_element(SITEMAP); - element.write_inner_content(|writer| -> quick_xml::Result<()> { - let tag = writer.create_element(LOCATION); - tag.write_text_content(events::BytesText::new(&location))?; - - if let Some(modified) = modified { - let tag = writer.create_element(LAST_MODIFIED); - tag.write_text_content(events::BytesText::new(&modified))?; - } - - Ok(()) - })?; - - let buf = temp.into_inner(); - if buf.len() > BYTE_LIMIT { - let over_limit = buf.len() - BYTE_LIMIT; - return Err(Error::ByteLimit { over: over_limit }); - } - - Ok(buf) - } - - pub(crate) fn create_index_close(&mut self) -> Result> { - self.inner.create_close_tag(SITEMAP_INDEX) - } -} - -impl std::fmt::Debug for IndexBuilder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("IndexBuilder") - .field("inner", &self.inner) - .finish() - } -} - -impl Builder for IndexBuilder { - type Error = Error; - - fn new(writer: W) -> Result { - let mut this = Self::from_writer(writer); - let temp = this.create_index_open()?; - this.inner.writer.write_all(&temp)?; - Ok(this) - } - - fn write(&mut self, record: &Index) -> Result<()> { - let temp = self.create_index_record(record)?; - self.inner.writer.write_all(&temp)?; - self.inner.records += 1; - Ok(()) - } - - fn close(mut self) -> Result { - let temp = self.create_index_close()?; - self.inner.writer.write_all(&temp)?; - Ok(self.into_inner()) - } -} - -#[cfg(feature 
= "tokio")] -#[cfg_attr(docsrs, doc(cfg(feature = "tokio")))] -mod tokio { - use async_trait::async_trait; - use tokio::io::{AsyncWrite, AsyncWriteExt}; - - use crate::build::{AsyncBuilder, IndexBuilder}; - use crate::record::Index; - use crate::{Error, Result}; - - #[async_trait] - impl AsyncBuilder for IndexBuilder { - type Error = Error; - - async fn new(writer: W) -> Result { - let mut this = Self::from_writer(writer); - let temp = this.create_index_open()?; - this.inner.writer.write_all(&temp).await?; - Ok(this) - } - - async fn write(&mut self, record: &Index) -> Result<()> { - let temp = self.create_index_record(record)?; - self.inner.writer.write_all(&temp).await?; - self.inner.records += 1; - Ok(()) - } - - async fn close(mut self) -> Result { - let temp = self.create_index_close()?; - self.inner.writer.write_all(&temp).await?; - Ok(self.into_inner()) - } - } -} diff --git a/inclusion/build/inner.rs b/inclusion/build/inner.rs deleted file mode 100644 index 42dea65..0000000 --- a/inclusion/build/inner.rs +++ /dev/null @@ -1,84 +0,0 @@ -use std::{marker::PhantomData, num::NonZeroU8}; - -use countio::Counter; -use quick_xml::{events, Writer}; -use time::format_description::well_known::iso8601; - -use crate::Error; - -pub(crate) const CONFIG: iso8601::EncodedConfig = iso8601::Config::DEFAULT - .set_time_precision(iso8601::TimePrecision::Second { - decimal_digits: NonZeroU8::new(2), - }) - .encode(); - -pub(crate) struct InnerBuilder { - pub(crate) record: PhantomData, - pub(crate) writer: Counter, - pub(crate) records: usize, -} - -impl InnerBuilder { - /// Creates a new instance with a provided writer. - pub fn from_writer(writer: W) -> Self { - Self { - record: PhantomData, - writer: Counter::new(writer), - records: 0, - } - } - - /// Returns a reference to the underlying writer. - pub fn get_ref(&self) -> &W { - self.writer.get_ref() - } - - /// Returns a mutable reference to the underlying writer. 
- pub fn get_mut(&mut self) -> &mut W { - self.writer.get_mut() - } - - /// Returns an underlying writer. - pub fn into_inner(self) -> W { - self.writer.into_inner() - } - - pub fn create_open_tag(&mut self, tag: &str) -> Result, Error> { - let mut temp = Writer::new(Vec::new()); - temp.write_bom()?; - - // - let decl = events::BytesDecl::new("1.0", Some("UTF-8"), None); - temp.write_event(events::Event::Decl(decl))?; - - // - // - const XMLNS: [(&str, &str); 1] = [("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")]; - - let tag = events::BytesStart::new(tag); - let tag = tag.with_attributes(XMLNS); - temp.write_event(events::Event::Start(tag))?; - - Ok(temp.into_inner()) - } - - pub fn create_close_tag(&mut self, tag: &str) -> Result, Error> { - let mut temp = Writer::new(Vec::new()); - - // - // - let tag = events::BytesEnd::new(tag); - temp.write_event(events::Event::End(tag))?; - - Ok(temp.into_inner()) - } -} - -impl std::fmt::Debug for InnerBuilder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("XmlBuilder") - .field("bytes", &self.writer.writer_bytes()) - .field("records", &self.records) - .finish() - } -} diff --git a/inclusion/build/mod.rs b/inclusion/build/mod.rs deleted file mode 100644 index 9a60e67..0000000 --- a/inclusion/build/mod.rs +++ /dev/null @@ -1,44 +0,0 @@ -mod auto; -mod entry; -mod index; -mod inner; -mod plain; - -pub use auto::*; -pub use entry::*; -pub use index::*; -pub(crate) use inner::*; -pub use plain::*; - -// TODO: Make builders take BufWrite. - -/// Core trait for the builder implementation. -pub trait Builder: Sized { - type Error: std::error::Error; - - // Creates a new `Builder` instance. - fn new(writer: W) -> Result; - - /// Writes another record into the underlying writer. - fn write(&mut self, record: &D) -> Result<(), Self::Error>; - - /// Closes tags if needed and releases the writer. - fn close(self) -> Result; -} - -/// Core trait for the async builder implementation. 
-#[cfg(feature = "tokio")] -#[cfg_attr(docsrs, doc(cfg(feature = "tokio")))] -#[async_trait::async_trait] -pub trait AsyncBuilder: Sized { - type Error: std::error::Error; - - // Creates a new `AsyncBuilder` instance. - async fn new(writer: W) -> Result; - - /// Writes another record into the underlying writer. - async fn write(&mut self, record: &D) -> Result<(), Self::Error>; - - /// Closes tags if needed and releases the writer. - async fn close(self) -> Result; -} diff --git a/inclusion/build/plain.rs b/inclusion/build/plain.rs deleted file mode 100644 index 1b82a75..0000000 --- a/inclusion/build/plain.rs +++ /dev/null @@ -1,224 +0,0 @@ -use std::io::Write; - -use countio::Counter; -use url::Url; - -use crate::build::Builder; -use crate::record::*; -use crate::{Error, Result}; - -/// Sitemap builder for the simple TXT file that contains one URL per line. -/// -/// For example: -/// -/// ```txt -/// https://www.example.com/file1.html -/// https://www.example.com/file2.html -/// ``` -/// -/// Enforces [total written/read bytes](BYTE_LIMIT) and [total records](RECORD_LIMIT) limits. -/// See [Error]. -/// -/// ```rust -/// use sitemapo::build::{Builder, PlainBuilder}; -/// -/// fn main() -> sitemapo::Result<()> { -/// let buf = Vec::new(); -/// let rec = "https://example.com/".try_into().unwrap(); -/// -/// let mut builder = PlainBuilder::new(buf)?; -/// builder.write(&rec)?; -/// let _buf = builder.close()?; -/// Ok(()) -/// } -/// ``` -pub struct PlainBuilder { - writer: Counter, - records: usize, -} - -impl PlainBuilder { - /// Returns a reference to the underlying writer. - pub fn get_ref(&self) -> &W { - self.writer.get_ref() - } - - /// Returns a mutable reference to the underlying writer. - pub fn get_mut(&mut self) -> &mut W { - self.writer.get_mut() - } - - /// Returns an underlying writer. - pub fn into_inner(self) -> W { - self.writer.into_inner() - } -} - -impl PlainBuilder { - /// Creates a new instance with a provided writer. 
- pub(crate) fn from_writer(writer: W) -> Self { - Self { - writer: Counter::new(writer), - records: 0, - } - } - - pub(crate) fn create_next_line(&mut self, url: &Url) -> Result> { - const NEWLINE: &str = "\n"; - - if self.records + 1 > RECORD_LIMIT { - return Err(Error::EntryLimit { over: 1 }); - } - - let record = url.to_string(); - let record_bytes = record.len() + NEWLINE.len(); - let total_bytes = self.writer.writer_bytes() + record_bytes; - if total_bytes > BYTE_LIMIT { - let over_limit = total_bytes - BYTE_LIMIT; - return Err(Error::ByteLimit { over: over_limit }); - } - - Ok((record + NEWLINE).into_bytes()) - } -} - -impl Builder for PlainBuilder { - type Error = Error; - - fn new(writer: W) -> Result { - Ok(Self::from_writer(writer)) - } - - fn write(&mut self, record: &Url) -> Result<()> { - let record = self.create_next_line(record)?; - self.writer.write_all(&record)?; - self.records += 1; - Ok(()) - } - - fn close(self) -> Result { - Ok(self.into_inner()) - } -} - -impl std::fmt::Debug for PlainBuilder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("TxtBuilder") - .field("bytes", &self.writer.writer_bytes()) - .field("records", &self.records) - .finish() - } -} - -#[cfg(feature = "tokio")] -#[cfg_attr(docsrs, doc(cfg(feature = "tokio")))] -mod tokio { - use async_trait::async_trait; - use tokio::io::{AsyncWrite, AsyncWriteExt}; - use url::Url; - - use crate::build::{AsyncBuilder, PlainBuilder}; - use crate::{Error, Result}; - - #[async_trait] - impl AsyncBuilder for PlainBuilder { - type Error = Error; - - async fn new(writer: W) -> Result { - Ok(Self::from_writer(writer)) - } - - async fn write(&mut self, record: &Url) -> Result<()> { - let record = self.create_next_line(record)?; - self.writer.write_all(&record).await?; - self.records += 1; - Ok(()) - } - - async fn close(self) -> Result { - Ok(self.into_inner()) - } - } -} - -#[cfg(test)] -mod test { - use std::io::BufWriter; - use url::Url; - - use 
crate::build::{Builder, PlainBuilder}; - use crate::Result; - - #[test] - fn synk() -> Result<()> { - let buf = Vec::new(); - let mut builder = PlainBuilder::new(buf).unwrap(); - - let url = Url::parse("https://example.com/").unwrap(); - builder.write(&url).unwrap(); - let buf = builder.close().unwrap(); - - let exp = String::from_utf8(buf).unwrap(); - assert_eq!(url.to_string() + "\n", exp); - - Ok(()) - } - - #[test] - fn synk_with_buf() -> Result<()> { - let buf = BufWriter::new(Vec::new()); - let mut builder = PlainBuilder::new(buf)?; - - let url = Url::parse("https://example.com/").unwrap(); - builder.write(&url)?; - let buf = builder.close()?; - - let buf = buf.into_inner().unwrap(); - let exp = String::from_utf8(buf).unwrap(); - assert_eq!(url.to_string() + "\n", exp); - - Ok(()) - } -} - -#[cfg(feature = "tokio")] -#[cfg(test)] -mod tokio_test { - use tokio::io::{AsyncWriteExt, BufWriter}; - use url::Url; - - use crate::build::{AsyncBuilder, PlainBuilder}; - use crate::Result; - - #[tokio::test] - async fn asynk() -> Result<()> { - let buf = Vec::new(); - let mut builder = PlainBuilder::new(buf).await?; - - let url = Url::parse("https://example.com/").unwrap(); - builder.write(&url).await?; - let buf = builder.close().await?; - - let exp = String::from_utf8(buf); - assert_eq!(Ok(url.to_string() + "\n"), exp); - - Ok(()) - } - - #[tokio::test] - async fn asynk_with_buf() -> Result<()> { - let buf = BufWriter::new(Vec::new()); - let mut builder = PlainBuilder::new(buf).await?; - - let url = Url::parse("https://example.com/").unwrap(); - builder.write(&url).await?; - let mut buf = builder.close().await?; - - let _ = buf.flush().await?; - let buf = buf.into_inner(); - let exp = String::from_utf8(buf); - assert_eq!(Ok(url.to_string() + "\n"), exp); - - Ok(()) - } -} diff --git a/exclusion/Cargo.toml b/robotxt/Cargo.toml similarity index 100% rename from exclusion/Cargo.toml rename to robotxt/Cargo.toml diff --git a/exclusion/README.md b/robotxt/README.md 
similarity index 100% rename from exclusion/README.md rename to robotxt/README.md diff --git a/exclusion/lib.rs b/robotxt/lib.rs similarity index 100% rename from exclusion/lib.rs rename to robotxt/lib.rs diff --git a/exclusion/parse/access.rs b/robotxt/parse/access.rs similarity index 100% rename from exclusion/parse/access.rs rename to robotxt/parse/access.rs diff --git a/exclusion/parse/inner.rs b/robotxt/parse/inner.rs similarity index 100% rename from exclusion/parse/inner.rs rename to robotxt/parse/inner.rs diff --git a/exclusion/parse/lexer.rs b/robotxt/parse/lexer.rs similarity index 100% rename from exclusion/parse/lexer.rs rename to robotxt/parse/lexer.rs diff --git a/exclusion/parse/mod.rs b/robotxt/parse/mod.rs similarity index 100% rename from exclusion/parse/mod.rs rename to robotxt/parse/mod.rs diff --git a/exclusion/parse/parser.rs b/robotxt/parse/parser.rs similarity index 100% rename from exclusion/parse/parser.rs rename to robotxt/parse/parser.rs diff --git a/exclusion/parse/rule.rs b/robotxt/parse/rule.rs similarity index 100% rename from exclusion/parse/rule.rs rename to robotxt/parse/rule.rs diff --git a/exclusion/parse/serde.rs b/robotxt/parse/serde.rs similarity index 100% rename from exclusion/parse/serde.rs rename to robotxt/parse/serde.rs diff --git a/exclusion/paths/create.rs b/robotxt/paths/create.rs similarity index 100% rename from exclusion/paths/create.rs rename to robotxt/paths/create.rs diff --git a/exclusion/paths/mod.rs b/robotxt/paths/mod.rs similarity index 100% rename from exclusion/paths/mod.rs rename to robotxt/paths/mod.rs diff --git a/exclusion/paths/normal.rs b/robotxt/paths/normal.rs similarity index 100% rename from exclusion/paths/normal.rs rename to robotxt/paths/normal.rs diff --git a/inclusion/Cargo.toml b/sitemapo/Cargo.toml similarity index 100% rename from inclusion/Cargo.toml rename to sitemapo/Cargo.toml diff --git a/inclusion/README.md b/sitemapo/README.md similarity index 100% rename from inclusion/README.md 
rename to sitemapo/README.md diff --git a/inclusion/lib.rs b/sitemapo/lib.rs similarity index 100% rename from inclusion/lib.rs rename to sitemapo/lib.rs diff --git a/inclusion/parse/auto.rs b/sitemapo/parse/auto.rs similarity index 100% rename from inclusion/parse/auto.rs rename to sitemapo/parse/auto.rs diff --git a/inclusion/parse/entry.rs b/sitemapo/parse/entry.rs similarity index 100% rename from inclusion/parse/entry.rs rename to sitemapo/parse/entry.rs diff --git a/inclusion/parse/index.rs b/sitemapo/parse/index.rs similarity index 100% rename from inclusion/parse/index.rs rename to sitemapo/parse/index.rs diff --git a/inclusion/parse/inner.rs b/sitemapo/parse/inner.rs similarity index 100% rename from inclusion/parse/inner.rs rename to sitemapo/parse/inner.rs diff --git a/inclusion/parse/mod.rs b/sitemapo/parse/mod.rs similarity index 100% rename from inclusion/parse/mod.rs rename to sitemapo/parse/mod.rs diff --git a/inclusion/parse/plain.rs b/sitemapo/parse/plain.rs similarity index 100% rename from inclusion/parse/plain.rs rename to sitemapo/parse/plain.rs diff --git a/inclusion/record/entry.rs b/sitemapo/record/entry.rs similarity index 100% rename from inclusion/record/entry.rs rename to sitemapo/record/entry.rs diff --git a/inclusion/record/frequency.rs b/sitemapo/record/frequency.rs similarity index 100% rename from inclusion/record/frequency.rs rename to sitemapo/record/frequency.rs diff --git a/inclusion/record/index.rs b/sitemapo/record/index.rs similarity index 100% rename from inclusion/record/index.rs rename to sitemapo/record/index.rs diff --git a/inclusion/record/mod.rs b/sitemapo/record/mod.rs similarity index 100% rename from inclusion/record/mod.rs rename to sitemapo/record/mod.rs diff --git a/inclusion/record/priority.rs b/sitemapo/record/priority.rs similarity index 100% rename from inclusion/record/priority.rs rename to sitemapo/record/priority.rs From 21849dddc70b406950ed2a52f2620009402fa623 Mon Sep 17 00:00:00 2001 From: Oleh 
Martsokha Date: Thu, 14 Mar 2024 13:45:49 +0100 Subject: [PATCH 02/11] fix(all): meta, docs --- LICENSE.txt | 2 +- README.md | 18 +++++++++--------- countio/Cargo.toml | 4 ++-- countio/README.md | 5 ++--- robotxt/Cargo.toml | 6 +++--- sitemapo/Cargo.toml | 6 +++--- 6 files changed, 20 insertions(+), 21 deletions(-) diff --git a/LICENSE.txt b/LICENSE.txt index 5817ff0..e481219 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023 Ethical Web Scraping +Copyright (c) 2023 spire-rs Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 41e0334..18ffda3 100644 --- a/README.md +++ b/README.md @@ -10,12 +10,12 @@ #### Crates: -- [countio]('./countio/): The wrapper struct to enable byte counting for - std::io::Read and std::io::Write and its asynchronous variants from futures - and tokio. -- [robotxt]('./robotxt/): The implementation of the Robots.txt (or URL - exclusion) protocol with the support of crawl-delay, sitemap and universal - match extensions. -- [sitemapo]('./sitemapo/): The implementation of the Sitemap (or URL - inclusion) protocol with the support of txt, xml formats and video, image, and - news extensions. +- [countio](./countio/): The wrapper struct to enable byte counting for + `std::io::{Read, Write, Seek}` and its async variants from `futures` and + `tokio`. +- [robotxt](./robotxt/): The implementation of the Robots.txt (or URL exclusion) + protocol with the support of crawl-delay, sitemap and universal match + extensions. +- [sitemapo](./sitemapo/): The implementation of the Sitemap (or URL inclusion) + protocol with the support of txt, xml formats and video, image, and news + extensions. 
diff --git a/countio/Cargo.toml b/countio/Cargo.toml index 741db32..fb9f3d8 100644 --- a/countio/Cargo.toml +++ b/countio/Cargo.toml @@ -9,8 +9,8 @@ edition = { workspace = true } license = { workspace = true } authors = { workspace = true } -repository = "https://github.com/spire-rs/kit/countio" -homepage = "https://github.com/spire-rs/kit/countio" +repository = "https://github.com/spire-rs/kit" +homepage = "https://github.com/spire-rs/kit" documentation = "https://docs.rs/countio" categories = ["parsing", "asynchronous"] keywords = ["byte", "tokio", "futures", "parsing"] diff --git a/countio/README.md b/countio/README.md index 7a75754..1255629 100644 --- a/countio/README.md +++ b/countio/README.md @@ -17,9 +17,8 @@ [coverage-badge]: https://img.shields.io/codecov/c/github/spire-rs/kit?logo=codecov&logoColor=white&style=flat-square [coverage-url]: https://app.codecov.io/gh/spire-rs/kit -The wrapper struct to enable byte counting for `std::io::Read`, -`std::io::Write`, `std::io::Seek` and its asynchronous variants from `futures` -and `tokio` crates. +The wrapper struct to enable byte counting for `std::io::{Read, Write, Seek}` +and its asynchronous variants from `futures` and `tokio` crates. 
### Features diff --git a/robotxt/Cargo.toml b/robotxt/Cargo.toml index 27b3388..4c26d77 100644 --- a/robotxt/Cargo.toml +++ b/robotxt/Cargo.toml @@ -9,11 +9,11 @@ edition = { workspace = true } license = { workspace = true } authors = { workspace = true } -repository = "https://github.com/spire-rs/kit/exclusion" -homepage = "https://github.com/spire-rs/kit/exclusion" +repository = "https://github.com/spire-rs/kit" +homepage = "https://github.com/spire-rs/kit" documentation = "https://docs.rs/robotxt" categories = ["asynchronous", "web-programming"] -keywords = ["crawler", "scraper", "web", "framework"] +keywords = ["robots", "robot", "exclusion", "crawler", "scraper"] description = """ The implementation of the Robots.txt (or URL exclusion) protocol with the support of crawl-delay, sitemap and universal match extensions. diff --git a/sitemapo/Cargo.toml b/sitemapo/Cargo.toml index ce883f8..1722c07 100644 --- a/sitemapo/Cargo.toml +++ b/sitemapo/Cargo.toml @@ -9,11 +9,11 @@ edition = { workspace = true } license = { workspace = true } authors = { workspace = true } -repository = "https://github.com/spire-rs/kit/inclusion" -homepage = "https://github.com/spire-rs/kit/inclusion" +repository = "https://github.com/spire-rs/kit" +homepage = "https://github.com/spire-rs/kit" documentation = "https://docs.rs/sitemapo" categories = ["parser-implementations", "web-programming"] -keywords = ["sitemap", "crawler", "parser"] +keywords = ["sitemaps", "sitemap", "inclusion", "crawler", "scraper"] description = """ The implementation of the Sitemap.xml (or URL inclusion) protocol with the support of txt & xml formats, and video, image, news extensions. 
From 32e74239e868ef9689325cdb71fc9a8d4a8bc381 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Thu, 14 Mar 2024 13:50:04 +0100 Subject: [PATCH 03/11] fix(all): fix gitignore --- .gitignore | 4 - README.md | 2 +- robotxt/build/group.rs | 178 ++++++++++++++++++++++++++ robotxt/build/mod.rs | 146 ++++++++++++++++++++++ robotxt/build/split.rs | 15 +++ sitemapo/build/auto.rs | 101 +++++++++++++++ sitemapo/build/entry.rs | 268 ++++++++++++++++++++++++++++++++++++++++ sitemapo/build/index.rs | 180 +++++++++++++++++++++++++++ sitemapo/build/inner.rs | 84 +++++++++++++ sitemapo/build/mod.rs | 44 +++++++ sitemapo/build/plain.rs | 224 +++++++++++++++++++++++++++++++++ 11 files changed, 1241 insertions(+), 5 deletions(-) create mode 100644 robotxt/build/group.rs create mode 100644 robotxt/build/mod.rs create mode 100644 robotxt/build/split.rs create mode 100644 sitemapo/build/auto.rs create mode 100644 sitemapo/build/entry.rs create mode 100644 sitemapo/build/index.rs create mode 100644 sitemapo/build/inner.rs create mode 100644 sitemapo/build/mod.rs create mode 100644 sitemapo/build/plain.rs diff --git a/.gitignore b/.gitignore index 1103880..bb27ea8 100644 --- a/.gitignore +++ b/.gitignore @@ -13,10 +13,6 @@ debug/ target/ **/*.rs.bk -# Output -dist/ -build/ - # Environment env/ .env diff --git a/README.md b/README.md index 18ffda3..f490897 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ `std::io::{Read, Write, Seek}` and its async variants from `futures` and `tokio`. - [robotxt](./robotxt/): The implementation of the Robots.txt (or URL exclusion) - protocol with the support of crawl-delay, sitemap and universal match + protocol with the support of `crawl-delay`, `sitemap` and universal `*` match extensions. 
- [sitemapo](./sitemapo/): The implementation of the Sitemap (or URL inclusion) protocol with the support of txt, xml formats and video, image, and news diff --git a/robotxt/build/group.rs b/robotxt/build/group.rs new file mode 100644 index 0000000..57f1e2d --- /dev/null +++ b/robotxt/build/group.rs @@ -0,0 +1,178 @@ +use std::collections::HashSet; +use std::fmt::{Display, Formatter}; + +use crate::build::format_comment; +use crate::paths::normalize_path; + +/// The single formatted `user-agent` group. +/// +/// See [crate::RobotsBuilder::group]. +#[derive(Debug, Default, Clone)] +pub struct GroupBuilder { + user_agents: HashSet, + rules_disallow: Vec, + rules_allow: Vec, + delay: Option, + + header: Option, + footer: Option, +} + +impl GroupBuilder { + /// Creates a new builder with default settings. + pub fn new() -> Self { + Self::default() + } + + /// Adds a local header, usually used for rule notes. + /// + /// ``` + /// use robotxt::RobotsBuilder; + /// + /// let txt = RobotsBuilder::default() + /// .group(["*"], |u| u.allow("/")) + /// .group(["foobot"], |u| { + /// u.header("Note: Bad Bot!") + /// .disallow("/") + /// .allow("/bad/bot.txt") + /// }); + /// ``` + pub fn header(mut self, header: &str) -> Self { + self.header = Some(header.to_string()); + self + } + + /// Adds an `Allow` directive. + /// + /// ``` + /// use robotxt::RobotsBuilder; + /// + /// let txt = RobotsBuilder::default() + /// .group(["foobot"], |u| { + /// u.allow("/").disallow("/secret.txt") + /// }); + /// ``` + pub fn allow(mut self, rule: &str) -> Self { + let rule = normalize_path(rule); + self.rules_allow.push(rule); + self + } + + /// Adds a `Disallow` directive. 
+ /// + /// ``` + /// use robotxt::RobotsBuilder; + /// + /// let txt = RobotsBuilder::default() + /// .group(["foobot"], |u| { + /// u.allow("/").disallow("/secret.txt") + /// }); + /// ``` + pub fn disallow(mut self, rule: &str) -> Self { + let rule = normalize_path(rule); + self.rules_disallow.push(rule); + self + } + + /// Adds a `Crawl-Delay` directive. + /// + /// ``` + /// use robotxt::RobotsBuilder; + /// + /// let txt = RobotsBuilder::default() + /// .group(["foobot"], |u| { + /// u.crawl_delay(5) + /// }); + /// ``` + pub fn crawl_delay(mut self, delay: u16) -> Self { + self.delay = Some(delay); + self + } + + /// Adds a local footer, usually used for rule notes. + /// + /// ``` + /// use robotxt::RobotsBuilder; + /// + /// let txt = RobotsBuilder::default() + /// .group(["foobot"], |u| { + /// u.footer("Note: Bad Bot!") + /// .disallow("/") + /// .allow("/bad/bot.txt") + /// }); + /// ``` + pub fn footer(mut self, footer: &str) -> Self { + self.footer = Some(footer.to_string()); + self + } +} + +impl<'ua> FromIterator<&'ua str> for GroupBuilder { + fn from_iter>(iter: T) -> Self { + let uas = iter.into_iter().map(|ua| ua.trim().to_string()); + Self { + user_agents: uas.collect(), + ..Self::default() + } + } +} + +impl Display for GroupBuilder { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let header = self.header.as_ref().map(|h| format_comment(h)); + let footer = self.footer.as_ref().map(|f| format_comment(f)); + let delay = self.delay.map(|d| format!("Crawl-Delay: {d}")); + + let agents = if self.user_agents.is_empty() { + Some("User-Agent: *".to_string()) + } else { + let uas = self.user_agents.iter(); + let uas = uas.map(|ua| format!("User-Agent: {ua}")); + Some(uas.collect::>().join("\n")) + }; + + let disallows = if self.rules_disallow.is_empty() { + None + } else { + let rd = self.rules_disallow.iter(); + let rd = rd.map(|r| format!("Disallow: {r}")); + Some(rd.collect::>().join("\n")) + }; + + let allows = if 
self.rules_allow.is_empty() { + // Explicit Allow: * if no Disallows. + // Used to interrupt the user-group i.e. + // user-agent: a ..no rules.. user-agent: b + match self.rules_disallow.is_empty() { + true => Some("Allow: *".to_string()), + false => None, + } + } else { + let rd = self.rules_allow.iter(); + let rd = rd.map(|r| format!("Allow: {r}")); + Some(rd.collect::>().join("\n")) + }; + + let result = [header, agents, delay, disallows, allows, footer]; + let result = result.iter().filter_map(|u| u.clone()); + let result = result.collect::>().join("\n"); + write!(f, "{}", result.as_str()) + } +} + +#[cfg(test)] +mod builder { + use super::*; + + #[test] + fn empty_uas() { + let r = GroupBuilder::new().disallow("/foo").to_string(); + assert!(r.contains("User-Agent: *")); + } + + #[test] + fn no_rules() { + let r = GroupBuilder::from_iter(["foobot"]).to_string(); + assert!(r.contains("Allow: *")); + } +} diff --git a/robotxt/build/mod.rs b/robotxt/build/mod.rs new file mode 100644 index 0000000..e2a3046 --- /dev/null +++ b/robotxt/build/mod.rs @@ -0,0 +1,146 @@ +use std::collections::HashSet; +use std::fmt; + +use url::Url; + +pub use crate::build::group::GroupBuilder; +use crate::build::split::format_comment; + +mod group; +mod split; + +/// The set of formatted `user-agent` groups that can be written +/// in the `robots.txt` compliant format. +#[derive(Debug, Default, Clone)] +pub struct RobotsBuilder { + groups: Vec, + sitemaps: HashSet, + header: Option, + footer: Option, +} + +impl RobotsBuilder { + /// Creates a new [`RobotsBuilder`] with default settings. + pub fn new() -> Self { + Self::default() + } + + /// Adds a global header, usually used for permissions or legal notices. 
+ /// + /// ``` + /// use robotxt::RobotsBuilder; + /// + /// let txt = RobotsBuilder::default() + /// .header("Note: Stop right there!") + /// .group(["*"], |u| u.disallow("/")) + /// .group(["foobot"], |u| u.allow("/")); + /// ``` + pub fn header(mut self, header: &str) -> Self { + self.header = Some(header.to_string()); + self + } + + /// Adds a new `user-agent` group from the provided list of user-agents. + /// + /// ``` + /// use robotxt::RobotsBuilder; + /// + /// let txt = RobotsBuilder::default() + /// .group(["*"], |u| u.disallow("/")) + /// .group(["foobot"], |u| u.allow("/")); + /// ``` + pub fn group<'a>( + mut self, + group: impl IntoIterator, + factory: impl FnOnce(GroupBuilder) -> GroupBuilder, + ) -> Self { + let section = GroupBuilder::from_iter(group); + self.groups.push(factory(section)); + self + } + + /// Adds the `Sitemap` directive from the URL address. + /// + /// ``` + /// use url::Url; + /// use robotxt::RobotsBuilder; + /// + /// let txt = RobotsBuilder::default() + /// .sitemap("https://example.com/sitemap_1.xml".try_into().unwrap()) + /// .sitemap("https://example.com/sitemap_1.xml".try_into().unwrap()); + /// ``` + pub fn sitemap(mut self, sitemap: Url) -> Self { + self.sitemaps.insert(sitemap); + self + } + + /// Adds a global footer, usually used for notices. + /// + /// ``` + /// use robotxt::RobotsBuilder; + /// + /// let txt = RobotsBuilder::default() + /// .group(["*"], |u| u.disallow("/")) + /// .group(["foobot"], |u| u.allow("/")) + /// .footer("Note: Have a nice day!"); + /// ``` + pub fn footer(mut self, footer: &str) -> Self { + self.footer = Some(footer.to_string()); + self + } + + /// Parses the constructed output. + /// See [`Robots::from_bytes`]. 
+ /// + /// [`Robots::from_bytes`]: crate::Robots::from_bytes + #[cfg(feature = "parser")] + #[cfg_attr(docsrs, doc(cfg(feature = "parser")))] + pub fn parse(&self, user_agent: &str) -> crate::Robots { + let txt = self.to_string(); + crate::Robots::from_bytes(txt.as_bytes(), user_agent) + } +} + +impl fmt::Display for RobotsBuilder { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let header = self.header.as_ref().map(|h| format_comment(h)); + let footer = self.footer.as_ref().map(|f| format_comment(f)); + + let groups = self.groups.iter().map(|u| u.to_string()); + let groups = groups.collect::>().join("\n\n"); + + let result = [header, Some(groups), footer]; + let result = result.iter().filter_map(|u| u.clone()); + let result = result.collect::>().join("\n\n"); + write!(f, "{}", result.as_str()) + } +} + +#[cfg(test)] +mod builder { + use crate::{Result, RobotsBuilder}; + + #[test] + fn readme() -> Result<()> { + let txt = RobotsBuilder::default() + .header("Robots.txt: Start") + .group(["foobot"], |u| { + u.crawl_delay(5) + .header("Rules for Foobot: Start") + .allow("/example/yeah.txt") + .disallow("/example/nope.txt") + .footer("Rules for Foobot: End") + }) + .group(["barbot", "nombot"], |u| { + u.crawl_delay(2) + .disallow("/example/yeah.txt") + .disallow("/example/nope.txt") + }) + .sitemap("https://example.com/sitemap_1.xml".try_into()?) + .sitemap("https://example.com/sitemap_2.xml".try_into()?) + .footer("Robots.txt: End"); + + println!("{}", txt.to_string()); + Ok(()) + } +} diff --git a/robotxt/build/split.rs b/robotxt/build/split.rs new file mode 100644 index 0000000..0bc57ee --- /dev/null +++ b/robotxt/build/split.rs @@ -0,0 +1,15 @@ +/// Splits multiline comments into lines and prefixes them with `#`. 
+pub fn format_comment(txt: &str) -> String { + txt.lines() + .map(|txt| txt.trim()) + .filter(|txt| !txt.is_empty()) + .map(|txt| { + if txt.starts_with('#') { + txt.to_owned() + } else { + format!("# {txt}") + } + }) + .collect::>() + .join("\n") +} diff --git a/sitemapo/build/auto.rs b/sitemapo/build/auto.rs new file mode 100644 index 0000000..c85dce1 --- /dev/null +++ b/sitemapo/build/auto.rs @@ -0,0 +1,101 @@ +use url::Url; + +use crate::build::{EntryBuilder, IndexBuilder}; +use crate::record::Entry; +use crate::Error; + +/// TODO: Desc. +/// +/// Automatic sitemap file constructor. +/// NOTE: Does not deduplicate records. +/// +/// ```rust +/// #[derive(Debug, thiserror::Error)] +/// enum CustomError { +/// // .. +/// #[error("sitemap error: {0}")] +/// Sitemap(#[from] sitemapo::Error), +/// //.. +/// } +/// +/// fn main() -> Result<(), CustomError> { +/// Ok(()) +/// } +/// ``` +pub struct AutoBuilder { + index: Option>, + entry: Vec>, + queue: Vec, + // factory: impl Fn() -> W, +} + +impl AutoBuilder { + /// TODO: Desc. + pub fn new() -> Self { + todo!() + } +} + +impl AutoBuilder +where + W: std::io::Write, +{ + /// TODO: Desc. + pub fn try_sync(&mut self, fetcher: A) -> Result<(), E> + where + E: std::error::Error + From, + A: Fn(Url) -> Result, E>, + { + // if let Some(builder) = self.entry.as_mut() { + // builder.write(record) + // } + + todo!() + } +} + +#[cfg(feature = "tokio")] +#[cfg_attr(docsrs, doc(cfg(feature = "tokio")))] +impl AutoBuilder +where + W: tokio::io::AsyncWrite + Unpin + Send, +{ + /// TODO: Desc. + pub async fn try_async(&mut self) -> Result<(), Error> { + todo!() + } +} + +impl std::fmt::Debug for AutoBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // TODO: Debug. 
+ f.debug_struct("AutoBuilder").finish() + } +} + +// impl Default for AutoBuilder { +// fn default() -> Self { +// Self { +// entry: None, +// index: None, +// } +// } +// } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn sync() -> Result<(), Error> { + // TODO: Test. + Ok(()) + } + + #[cfg(feature = "tokio")] + #[tokio::test] + async fn asynk() -> Result<(), Error> { + // TODO: Test. + Ok(()) + } +} diff --git a/sitemapo/build/entry.rs b/sitemapo/build/entry.rs new file mode 100644 index 0000000..46508a5 --- /dev/null +++ b/sitemapo/build/entry.rs @@ -0,0 +1,268 @@ +use std::io::Write; + +use quick_xml::{events, Writer}; +use time::format_description::well_known::Iso8601; + +use crate::build::{Builder, InnerBuilder, CONFIG}; +use crate::record::*; +use crate::{Error, Result}; + +/// Sitemap builder for the versatile XML file with an optional support of extensions. +/// +/// For example: +/// +/// ```xml +/// +/// +/// +/// https://www.example.com/foo.html +/// 2022-06-04 +/// +/// +/// ``` +/// +/// Enforces total written/read bytes and total records limits. +/// See [Error]. +/// +/// ```rust +/// use sitemapo::build::{Builder, EntryBuilder}; +/// use sitemapo::record::Entry; +/// +/// fn main() -> sitemapo::Result<()> { +/// let buf = Vec::new(); +/// let url = "https://example.com/".try_into().unwrap(); +/// let rec = Entry::new(url); +/// +/// let mut builder = EntryBuilder::new(buf)?; +/// builder.write(&rec)?; +/// let _buf = builder.close()?; +/// Ok(()) +/// } +/// ``` +pub struct EntryBuilder { + inner: InnerBuilder, +} + +impl EntryBuilder { + /// Creates a new instance with the given writer. + pub(crate) fn from_writer(writer: W) -> Self { + let inner = InnerBuilder::from_writer(writer); + Self::from_inner(inner) + } + + /// Creates a new instance with the given inner parser. + pub(crate) fn from_inner(inner: InnerBuilder) -> Self { + Self { inner } + } + + /// Returns a reference to the underlying writer. 
+ pub fn get_ref(&self) -> &W { + self.inner.get_ref() + } + + /// Returns a mutable reference to the underlying writer. + pub fn get_mut(&mut self) -> &mut W { + self.inner.get_mut() + } + + /// Returns an underlying writer. + pub fn into_inner(self) -> W { + self.inner.into_inner() + } + + pub(crate) fn create_entry_open(&mut self) -> Result> { + self.inner.create_open_tag(URL_SET) + } + + pub(crate) fn create_entry_record(&mut self, record: &Entry) -> Result> { + if self.inner.records + 1 > RECORD_LIMIT { + return Err(Error::EntryLimit { over: 1 }); + } + + let format = &Iso8601::<{ CONFIG }>; + let location = record.location.to_string(); + let modified = record.modified.map(|u| u.format(format).unwrap()); + let priority = record.priority.map(|u| u.to_string()); + let frequency = record.frequency.map(|u| u.to_string()); + + let mut temp = Writer::new(Vec::new()); + let element = temp.create_element(URL); + element.write_inner_content(|writer| -> quick_xml::Result<()> { + let tag = writer.create_element(LOCATION); + tag.write_text_content(events::BytesText::new(&location))?; + + if let Some(modified) = modified { + let tag = writer.create_element(LAST_MODIFIED); + tag.write_text_content(events::BytesText::new(&modified))?; + } + + if let Some(priority) = priority { + let tag = writer.create_element(PRIORITY); + tag.write_text_content(events::BytesText::new(&priority))?; + } + + if let Some(frequency) = frequency { + let tag = writer.create_element(CHANGE_FREQUENCY); + tag.write_text_content(events::BytesText::new(&frequency))?; + } + + Ok(()) + })?; + + let buf = temp.into_inner(); + if buf.len() > BYTE_LIMIT { + let over_limit = buf.len() - BYTE_LIMIT; + return Err(Error::ByteLimit { over: over_limit }); + } + + Ok(buf) + } + + pub(crate) fn create_entry_close(&mut self) -> Result> { + self.inner.create_close_tag(URL_SET) + } +} + +impl std::fmt::Debug for EntryBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + 
f.debug_struct("EntryBuilder") + .field("inner", &self.inner) + .finish() + } +} + +impl Builder for EntryBuilder { + type Error = Error; + + fn new(writer: W) -> Result { + let mut this = Self::from_writer(writer); + let temp = this.create_entry_open()?; + this.inner.writer.write_all(&temp)?; + Ok(this) + } + + fn write(&mut self, record: &Entry) -> Result<()> { + let temp = self.create_entry_record(record)?; + self.inner.writer.write_all(&temp)?; + self.inner.records += 1; + Ok(()) + } + + fn close(mut self) -> Result { + let temp = self.create_entry_close()?; + self.inner.writer.write_all(&temp)?; + Ok(self.into_inner()) + } +} + +#[cfg(feature = "tokio")] +#[cfg_attr(docsrs, doc(cfg(feature = "tokio")))] +mod tokio { + use async_trait::async_trait; + use tokio::io::{AsyncWrite, AsyncWriteExt}; + + use crate::build::{AsyncBuilder, EntryBuilder}; + use crate::record::Entry; + use crate::{Error, Result}; + + #[async_trait] + impl AsyncBuilder for EntryBuilder { + type Error = Error; + + async fn new(writer: W) -> Result { + let mut this = Self::from_writer(writer); + let temp = this.create_entry_open()?; + this.inner.writer.write_all(&temp).await?; + Ok(this) + } + + async fn write(&mut self, record: &Entry) -> Result<()> { + let temp = self.create_entry_record(record)?; + self.inner.writer.write_all(&temp).await?; + self.inner.records += 1; + Ok(()) + } + + async fn close(mut self) -> Result { + let temp = self.create_entry_close()?; + self.inner.writer.write_all(&temp).await?; + Ok(self.into_inner()) + } + } +} + +#[cfg(test)] +mod test { + use std::io::BufWriter; + + use url::Url; + + use crate::build::{Builder, EntryBuilder}; + use crate::record::Entry; + use crate::Result; + + #[test] + fn synk() -> Result<()> { + let buf = Vec::new(); + let mut builder = EntryBuilder::new(buf)?; + + let url = Url::parse("https://example.com/").unwrap(); + let rec = Entry::new(url); + builder.write(&rec)?; + let _buf = builder.close()?; + + Ok(()) + } + + #[test] + fn 
synk_with_buf() -> Result<()> { + let buf = BufWriter::new(Vec::new()); + let mut builder = EntryBuilder::new(buf)?; + + let url = Url::parse("https://example.com/").unwrap(); + let rec = Entry::new(url); + builder.write(&rec)?; + let _buf = builder.close()?; + + Ok(()) + } +} + +#[cfg(feature = "tokio")] +#[cfg(test)] +mod tokio_test { + use tokio::io::{AsyncWriteExt, BufWriter}; + use url::Url; + + use crate::build::{AsyncBuilder, EntryBuilder}; + use crate::{record::Entry, Result}; + + #[tokio::test] + async fn asynk() -> Result<()> { + let buf = Vec::new(); + let mut builder = EntryBuilder::new(buf).await?; + + let url = Url::parse("https://example.com/").unwrap(); + let rec = Entry::new(url); + builder.write(&rec).await?; + let _buf = builder.close().await?; + + Ok(()) + } + + #[tokio::test] + async fn asynk_with_buf() -> Result<()> { + let buf = BufWriter::new(Vec::new()); + let mut builder = EntryBuilder::new(buf).await?; + + let url = Url::parse("https://example.com/").unwrap(); + + let rec = Entry::new(url); + builder.write(&rec).await?; + let mut buf = builder.close().await?; + + let _ = buf.flush().await?; + + Ok(()) + } +} diff --git a/sitemapo/build/index.rs b/sitemapo/build/index.rs new file mode 100644 index 0000000..10222b1 --- /dev/null +++ b/sitemapo/build/index.rs @@ -0,0 +1,180 @@ +use std::io::Write; + +use quick_xml::{events, Writer}; +use time::format_description::well_known::Iso8601; + +use crate::build::{Builder, InnerBuilder, CONFIG}; +use crate::record::*; +use crate::{Error, Result}; + +/// Sitemap index parser for the versatile XML file. +/// +/// For example: +/// +/// ```xml +/// +/// +/// +/// http://www.example.com/sitemap.xml.gz +/// 2004-10-01T18:23:17+00:00 +/// +/// +/// ``` +/// +/// Enforces total written/read bytes and total records limits. +/// See [Error]. 
+/// +/// ```rust +/// use sitemapo::build::{Builder, IndexBuilder}; +/// use sitemapo::record::Index; +/// +/// fn main() -> sitemapo::Result<()> { +/// let buf = Vec::new(); +/// let url = "https://example.com/".try_into().unwrap(); +/// let rec = Index::new(url); +/// +/// let mut builder = IndexBuilder::new(buf)?; +/// builder.write(&rec)?; +/// let _buf = builder.close()?; +/// Ok(()) +/// } +/// ``` +pub struct IndexBuilder { + inner: InnerBuilder, +} + +impl IndexBuilder { + /// Creates a new instance with the given writer. + pub(crate) fn from_writer(writer: W) -> Self { + let inner = InnerBuilder::from_writer(writer); + Self::from_inner(inner) + } + + /// Creates a new instance with the given inner parser. + pub(crate) fn from_inner(inner: InnerBuilder) -> Self { + Self { inner } + } + + /// Returns a reference to the underlying writer. + pub fn get_ref(&self) -> &W { + self.inner.get_ref() + } + + /// Returns a mutable reference to the underlying writer. + pub fn get_mut(&mut self) -> &mut W { + self.inner.get_mut() + } + + /// Returns an underlying writer. 
+ pub fn into_inner(self) -> W { + self.inner.into_inner() + } + + pub(crate) fn create_index_open(&mut self) -> Result> { + self.inner.create_open_tag(SITEMAP_INDEX) + } + + pub(crate) fn create_index_record(&mut self, record: &Index) -> Result> { + if self.inner.records + 1 > RECORD_LIMIT { + return Err(Error::EntryLimit { over: 1 }); + } + + let format = &Iso8601::<{ CONFIG }>; + let location = record.location.to_string(); + let modified = record.modified.map(|u| u.format(format).unwrap()); + + let mut temp = Writer::new(Vec::new()); + let element = temp.create_element(SITEMAP); + element.write_inner_content(|writer| -> quick_xml::Result<()> { + let tag = writer.create_element(LOCATION); + tag.write_text_content(events::BytesText::new(&location))?; + + if let Some(modified) = modified { + let tag = writer.create_element(LAST_MODIFIED); + tag.write_text_content(events::BytesText::new(&modified))?; + } + + Ok(()) + })?; + + let buf = temp.into_inner(); + if buf.len() > BYTE_LIMIT { + let over_limit = buf.len() - BYTE_LIMIT; + return Err(Error::ByteLimit { over: over_limit }); + } + + Ok(buf) + } + + pub(crate) fn create_index_close(&mut self) -> Result> { + self.inner.create_close_tag(SITEMAP_INDEX) + } +} + +impl std::fmt::Debug for IndexBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("IndexBuilder") + .field("inner", &self.inner) + .finish() + } +} + +impl Builder for IndexBuilder { + type Error = Error; + + fn new(writer: W) -> Result { + let mut this = Self::from_writer(writer); + let temp = this.create_index_open()?; + this.inner.writer.write_all(&temp)?; + Ok(this) + } + + fn write(&mut self, record: &Index) -> Result<()> { + let temp = self.create_index_record(record)?; + self.inner.writer.write_all(&temp)?; + self.inner.records += 1; + Ok(()) + } + + fn close(mut self) -> Result { + let temp = self.create_index_close()?; + self.inner.writer.write_all(&temp)?; + Ok(self.into_inner()) + } +} + +#[cfg(feature 
= "tokio")] +#[cfg_attr(docsrs, doc(cfg(feature = "tokio")))] +mod tokio { + use async_trait::async_trait; + use tokio::io::{AsyncWrite, AsyncWriteExt}; + + use crate::build::{AsyncBuilder, IndexBuilder}; + use crate::record::Index; + use crate::{Error, Result}; + + #[async_trait] + impl AsyncBuilder for IndexBuilder { + type Error = Error; + + async fn new(writer: W) -> Result { + let mut this = Self::from_writer(writer); + let temp = this.create_index_open()?; + this.inner.writer.write_all(&temp).await?; + Ok(this) + } + + async fn write(&mut self, record: &Index) -> Result<()> { + let temp = self.create_index_record(record)?; + self.inner.writer.write_all(&temp).await?; + self.inner.records += 1; + Ok(()) + } + + async fn close(mut self) -> Result { + let temp = self.create_index_close()?; + self.inner.writer.write_all(&temp).await?; + Ok(self.into_inner()) + } + } +} diff --git a/sitemapo/build/inner.rs b/sitemapo/build/inner.rs new file mode 100644 index 0000000..42dea65 --- /dev/null +++ b/sitemapo/build/inner.rs @@ -0,0 +1,84 @@ +use std::{marker::PhantomData, num::NonZeroU8}; + +use countio::Counter; +use quick_xml::{events, Writer}; +use time::format_description::well_known::iso8601; + +use crate::Error; + +pub(crate) const CONFIG: iso8601::EncodedConfig = iso8601::Config::DEFAULT + .set_time_precision(iso8601::TimePrecision::Second { + decimal_digits: NonZeroU8::new(2), + }) + .encode(); + +pub(crate) struct InnerBuilder { + pub(crate) record: PhantomData, + pub(crate) writer: Counter, + pub(crate) records: usize, +} + +impl InnerBuilder { + /// Creates a new instance with a provided writer. + pub fn from_writer(writer: W) -> Self { + Self { + record: PhantomData, + writer: Counter::new(writer), + records: 0, + } + } + + /// Returns a reference to the underlying writer. + pub fn get_ref(&self) -> &W { + self.writer.get_ref() + } + + /// Returns a mutable reference to the underlying writer. 
+ pub fn get_mut(&mut self) -> &mut W { + self.writer.get_mut() + } + + /// Returns an underlying writer. + pub fn into_inner(self) -> W { + self.writer.into_inner() + } + + pub fn create_open_tag(&mut self, tag: &str) -> Result, Error> { + let mut temp = Writer::new(Vec::new()); + temp.write_bom()?; + + // + let decl = events::BytesDecl::new("1.0", Some("UTF-8"), None); + temp.write_event(events::Event::Decl(decl))?; + + // + // + const XMLNS: [(&str, &str); 1] = [("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")]; + + let tag = events::BytesStart::new(tag); + let tag = tag.with_attributes(XMLNS); + temp.write_event(events::Event::Start(tag))?; + + Ok(temp.into_inner()) + } + + pub fn create_close_tag(&mut self, tag: &str) -> Result, Error> { + let mut temp = Writer::new(Vec::new()); + + // + // + let tag = events::BytesEnd::new(tag); + temp.write_event(events::Event::End(tag))?; + + Ok(temp.into_inner()) + } +} + +impl std::fmt::Debug for InnerBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("XmlBuilder") + .field("bytes", &self.writer.writer_bytes()) + .field("records", &self.records) + .finish() + } +} diff --git a/sitemapo/build/mod.rs b/sitemapo/build/mod.rs new file mode 100644 index 0000000..9a60e67 --- /dev/null +++ b/sitemapo/build/mod.rs @@ -0,0 +1,44 @@ +mod auto; +mod entry; +mod index; +mod inner; +mod plain; + +pub use auto::*; +pub use entry::*; +pub use index::*; +pub(crate) use inner::*; +pub use plain::*; + +// TODO: Make builders take BufWrite. + +/// Core trait for the builder implementation. +pub trait Builder: Sized { + type Error: std::error::Error; + + // Creates a new `Builder` instance. + fn new(writer: W) -> Result; + + /// Writes another record into the underlying writer. + fn write(&mut self, record: &D) -> Result<(), Self::Error>; + + /// Closes tags if needed and releases the writer. + fn close(self) -> Result; +} + +/// Core trait for the async builder implementation. 
+#[cfg(feature = "tokio")] +#[cfg_attr(docsrs, doc(cfg(feature = "tokio")))] +#[async_trait::async_trait] +pub trait AsyncBuilder: Sized { + type Error: std::error::Error; + + // Creates a new `AsyncBuilder` instance. + async fn new(writer: W) -> Result; + + /// Writes another record into the underlying writer. + async fn write(&mut self, record: &D) -> Result<(), Self::Error>; + + /// Closes tags if needed and releases the writer. + async fn close(self) -> Result; +} diff --git a/sitemapo/build/plain.rs b/sitemapo/build/plain.rs new file mode 100644 index 0000000..1b82a75 --- /dev/null +++ b/sitemapo/build/plain.rs @@ -0,0 +1,224 @@ +use std::io::Write; + +use countio::Counter; +use url::Url; + +use crate::build::Builder; +use crate::record::*; +use crate::{Error, Result}; + +/// Sitemap builder for the simple TXT file that contains one URL per line. +/// +/// For example: +/// +/// ```txt +/// https://www.example.com/file1.html +/// https://www.example.com/file2.html +/// ``` +/// +/// Enforces [total written/read bytes](BYTE_LIMIT) and [total records](RECORD_LIMIT) limits. +/// See [Error]. +/// +/// ```rust +/// use sitemapo::build::{Builder, PlainBuilder}; +/// +/// fn main() -> sitemapo::Result<()> { +/// let buf = Vec::new(); +/// let rec = "https://example.com/".try_into().unwrap(); +/// +/// let mut builder = PlainBuilder::new(buf)?; +/// builder.write(&rec)?; +/// let _buf = builder.close()?; +/// Ok(()) +/// } +/// ``` +pub struct PlainBuilder { + writer: Counter, + records: usize, +} + +impl PlainBuilder { + /// Returns a reference to the underlying writer. + pub fn get_ref(&self) -> &W { + self.writer.get_ref() + } + + /// Returns a mutable reference to the underlying writer. + pub fn get_mut(&mut self) -> &mut W { + self.writer.get_mut() + } + + /// Returns an underlying writer. + pub fn into_inner(self) -> W { + self.writer.into_inner() + } +} + +impl PlainBuilder { + /// Creates a new instance with a provided writer. 
+ pub(crate) fn from_writer(writer: W) -> Self { + Self { + writer: Counter::new(writer), + records: 0, + } + } + + pub(crate) fn create_next_line(&mut self, url: &Url) -> Result> { + const NEWLINE: &str = "\n"; + + if self.records + 1 > RECORD_LIMIT { + return Err(Error::EntryLimit { over: 1 }); + } + + let record = url.to_string(); + let record_bytes = record.len() + NEWLINE.len(); + let total_bytes = self.writer.writer_bytes() + record_bytes; + if total_bytes > BYTE_LIMIT { + let over_limit = total_bytes - BYTE_LIMIT; + return Err(Error::ByteLimit { over: over_limit }); + } + + Ok((record + NEWLINE).into_bytes()) + } +} + +impl Builder for PlainBuilder { + type Error = Error; + + fn new(writer: W) -> Result { + Ok(Self::from_writer(writer)) + } + + fn write(&mut self, record: &Url) -> Result<()> { + let record = self.create_next_line(record)?; + self.writer.write_all(&record)?; + self.records += 1; + Ok(()) + } + + fn close(self) -> Result { + Ok(self.into_inner()) + } +} + +impl std::fmt::Debug for PlainBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("TxtBuilder") + .field("bytes", &self.writer.writer_bytes()) + .field("records", &self.records) + .finish() + } +} + +#[cfg(feature = "tokio")] +#[cfg_attr(docsrs, doc(cfg(feature = "tokio")))] +mod tokio { + use async_trait::async_trait; + use tokio::io::{AsyncWrite, AsyncWriteExt}; + use url::Url; + + use crate::build::{AsyncBuilder, PlainBuilder}; + use crate::{Error, Result}; + + #[async_trait] + impl AsyncBuilder for PlainBuilder { + type Error = Error; + + async fn new(writer: W) -> Result { + Ok(Self::from_writer(writer)) + } + + async fn write(&mut self, record: &Url) -> Result<()> { + let record = self.create_next_line(record)?; + self.writer.write_all(&record).await?; + self.records += 1; + Ok(()) + } + + async fn close(self) -> Result { + Ok(self.into_inner()) + } + } +} + +#[cfg(test)] +mod test { + use std::io::BufWriter; + use url::Url; + + use 
crate::build::{Builder, PlainBuilder}; + use crate::Result; + + #[test] + fn synk() -> Result<()> { + let buf = Vec::new(); + let mut builder = PlainBuilder::new(buf).unwrap(); + + let url = Url::parse("https://example.com/").unwrap(); + builder.write(&url).unwrap(); + let buf = builder.close().unwrap(); + + let exp = String::from_utf8(buf).unwrap(); + assert_eq!(url.to_string() + "\n", exp); + + Ok(()) + } + + #[test] + fn synk_with_buf() -> Result<()> { + let buf = BufWriter::new(Vec::new()); + let mut builder = PlainBuilder::new(buf)?; + + let url = Url::parse("https://example.com/").unwrap(); + builder.write(&url)?; + let buf = builder.close()?; + + let buf = buf.into_inner().unwrap(); + let exp = String::from_utf8(buf).unwrap(); + assert_eq!(url.to_string() + "\n", exp); + + Ok(()) + } +} + +#[cfg(feature = "tokio")] +#[cfg(test)] +mod tokio_test { + use tokio::io::{AsyncWriteExt, BufWriter}; + use url::Url; + + use crate::build::{AsyncBuilder, PlainBuilder}; + use crate::Result; + + #[tokio::test] + async fn asynk() -> Result<()> { + let buf = Vec::new(); + let mut builder = PlainBuilder::new(buf).await?; + + let url = Url::parse("https://example.com/").unwrap(); + builder.write(&url).await?; + let buf = builder.close().await?; + + let exp = String::from_utf8(buf); + assert_eq!(Ok(url.to_string() + "\n"), exp); + + Ok(()) + } + + #[tokio::test] + async fn asynk_with_buf() -> Result<()> { + let buf = BufWriter::new(Vec::new()); + let mut builder = PlainBuilder::new(buf).await?; + + let url = Url::parse("https://example.com/").unwrap(); + builder.write(&url).await?; + let mut buf = builder.close().await?; + + let _ = buf.flush().await?; + let buf = buf.into_inner(); + let exp = String::from_utf8(buf); + assert_eq!(Ok(url.to_string() + "\n"), exp); + + Ok(()) + } +} From f0d8a5905a16890d83ecae9819d6e2c438ed1a5d Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 14 Apr 2024 01:20:44 +0200 Subject: [PATCH 04/11] fix(countio): inline, typos --- 
countio/counter/futures.rs | 2 +- countio/counter/mod.rs | 21 ++++++++++++++++----- countio/counter/stdlib.rs | 2 +- countio/counter/tokio.rs | 4 ++-- countio/lib.rs | 2 +- 5 files changed, 21 insertions(+), 10 deletions(-) diff --git a/countio/counter/futures.rs b/countio/counter/futures.rs index e22590f..64b0f8a 100644 --- a/countio/counter/futures.rs +++ b/countio/counter/futures.rs @@ -17,7 +17,7 @@ impl AsyncRead for Counter { let pin = Pin::new(&mut counter.inner); let poll = pin.poll_read(ctx, buf); if let Poll::Ready(Ok(bytes)) = poll { - counter.reader_bytes += bytes + counter.reader_bytes += bytes; } poll diff --git a/countio/counter/mod.rs b/countio/counter/mod.rs index b3cfc0b..04de808 100644 --- a/countio/counter/mod.rs +++ b/countio/counter/mod.rs @@ -13,18 +13,22 @@ mod tokio; /// The `Counter` struct adds byte counting to any reader or writer. pub struct Counter { pub(crate) inner: D, + /// Total bytes read from the `inner` reader. pub(crate) reader_bytes: usize, + /// Total bytes written to the `inner` writer. pub(crate) writer_bytes: usize, } impl Counter { /// Creates a new `Counter` with zero read/written bytes. - pub fn new(inner: D) -> Self { + #[inline] + pub const fn new(inner: D) -> Self { Self::with_bytes(0, 0, inner) } /// Creates a new `Counter` with the specified read/written bytes. - pub fn with_bytes(reader_bytes: usize, writer_bytes: usize, inner: D) -> Self { + #[inline] + pub const fn with_bytes(reader_bytes: usize, writer_bytes: usize, inner: D) -> Self { Self { inner, reader_bytes, @@ -33,37 +37,44 @@ impl Counter { } /// Returns the sum of read and written bytes by the underlying reader/writer. - pub fn total_bytes(&self) -> usize { + #[inline] + pub const fn total_bytes(&self) -> usize { self.reader_bytes + self.writer_bytes } /// Returns the total amount of read bytes by the underlying reader. 
- pub fn reader_bytes(&self) -> usize { + #[inline] + pub const fn reader_bytes(&self) -> usize { self.reader_bytes } /// Returns the total amount of written bytes by the underlying writer. - pub fn writer_bytes(&self) -> usize { + #[inline] + pub const fn writer_bytes(&self) -> usize { self.writer_bytes } /// Consumes `Counter` returning the underlying reader/writer. + #[inline] pub fn into_inner(self) -> D { self.inner } /// Gets a reference to the underlying reader/writer. + #[inline] pub fn get_ref(&self) -> &D { &self.inner } /// Gets a mutable reference to the underlying reader/writer. + #[inline] pub fn get_mut(&mut self) -> &mut D { &mut self.inner } } impl From for Counter { + #[inline] fn from(inner: D) -> Self { Self::new(inner) } diff --git a/countio/counter/stdlib.rs b/countio/counter/stdlib.rs index dde59fa..adfc8e3 100644 --- a/countio/counter/stdlib.rs +++ b/countio/counter/stdlib.rs @@ -19,7 +19,7 @@ impl BufRead for Counter { fn consume(&mut self, amt: usize) { self.reader_bytes += amt; - self.inner.consume(amt) + self.inner.consume(amt); } } diff --git a/countio/counter/tokio.rs b/countio/counter/tokio.rs index 17e74b5..6e97336 100644 --- a/countio/counter/tokio.rs +++ b/countio/counter/tokio.rs @@ -20,8 +20,8 @@ impl AsyncRead for Counter { let poll = pin.poll_read(ctx, buf); let bytes = buf.filled().len() - bytes; - if let Poll::Ready(Ok(())) = poll { - counter.reader_bytes += bytes + if matches!(poll, Poll::Ready(Ok(()))) { + counter.reader_bytes += bytes; } poll diff --git a/countio/lib.rs b/countio/lib.rs index a15f8ef..0f66732 100644 --- a/countio/lib.rs +++ b/countio/lib.rs @@ -8,5 +8,5 @@ mod counter; #[doc(hidden)] pub mod prelude { - pub use super::counter::Counter; + pub use super::Counter; } From 0072577e1d944f407d334284924b45881ffd4046 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 14 Apr 2024 01:21:03 +0200 Subject: [PATCH 05/11] fix(ci): update node --- .github/workflows/build.yaml | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 9e9f49a..54ee07d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -17,7 +17,7 @@ jobs: steps: - name: Check out - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up ${{ matrix.toolchain }} Rust uses: dtolnay/rust-toolchain@master @@ -25,7 +25,7 @@ jobs: toolchain: ${{ matrix.toolchain }} - name: Set up Cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: | ~/.cargo/bin/ From 2c9313d44848fd4c8f5157f31b5015e8b5633a18 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 14 Apr 2024 01:39:57 +0200 Subject: [PATCH 06/11] fix(robotxt): typos, attr, deps --- robotxt/Cargo.toml | 2 +- robotxt/README.md | 4 ++-- robotxt/build/group.rs | 3 ++- robotxt/build/mod.rs | 3 ++- robotxt/build/split.rs | 2 +- robotxt/lib.rs | 15 ++++++++------- robotxt/parse/access.rs | 9 ++++++--- robotxt/parse/inner.rs | 27 ++++++++++++++++++-------- robotxt/parse/lexer.rs | 4 ++-- robotxt/parse/mod.rs | 42 ++++++++++++++++++++++++++++++----------- robotxt/parse/parser.rs | 4 ++-- robotxt/parse/rule.rs | 10 +++++----- robotxt/parse/serde.rs | 9 +++++---- robotxt/paths/normal.rs | 6 +++--- 14 files changed, 89 insertions(+), 51 deletions(-) diff --git a/robotxt/Cargo.toml b/robotxt/Cargo.toml index 4c26d77..6e09410 100644 --- a/robotxt/Cargo.toml +++ b/robotxt/Cargo.toml @@ -51,7 +51,7 @@ percent-encoding = { version = "2.3.1" } nom = { version = "7.1.3", optional = true } bstr = { version = "1.9.1", optional = true } -regex = { version = "1.10.3", optional = true } +regex = { version = "1.10.4", optional = true } serde = { workspace = true, optional = true } [dev-dependencies] diff --git a/robotxt/README.md b/robotxt/README.md index 290bdf6..ff24361 100644 --- a/robotxt/README.md +++ b/robotxt/README.md @@ -56,9 +56,9 @@ fn main() { - build the new `robots.txt` file in a declarative manner: ```rust -use 
robotxt::RobotsBuilder; +use robotxt::{RobotsBuilder, Result}; -fn main() -> Result<(), url::ParseError> { +fn main() -> Result<()> { let txt = RobotsBuilder::default() .header("Robots.txt: Start") .group(["foobot"], |u| { diff --git a/robotxt/build/group.rs b/robotxt/build/group.rs index 57f1e2d..220c704 100644 --- a/robotxt/build/group.rs +++ b/robotxt/build/group.rs @@ -6,7 +6,8 @@ use crate::paths::normalize_path; /// The single formatted `user-agent` group. /// -/// See [crate::RobotsBuilder::group]. +/// See [`crate::RobotsBuilder::group`]. +#[must_use] #[derive(Debug, Default, Clone)] pub struct GroupBuilder { user_agents: HashSet, diff --git a/robotxt/build/mod.rs b/robotxt/build/mod.rs index e2a3046..faeff95 100644 --- a/robotxt/build/mod.rs +++ b/robotxt/build/mod.rs @@ -11,6 +11,7 @@ mod split; /// The set of formatted `user-agent` groups that can be written /// in the `robots.txt` compliant format. +#[must_use] #[derive(Debug, Default, Clone)] pub struct RobotsBuilder { groups: Vec, @@ -110,7 +111,7 @@ impl fmt::Display for RobotsBuilder { let groups = groups.collect::>().join("\n\n"); let result = [header, Some(groups), footer]; - let result = result.iter().filter_map(|u| u.clone()); + let result = result.iter().filter_map(Clone::clone); let result = result.collect::>().join("\n\n"); write!(f, "{}", result.as_str()) } diff --git a/robotxt/build/split.rs b/robotxt/build/split.rs index 0bc57ee..d2b5e27 100644 --- a/robotxt/build/split.rs +++ b/robotxt/build/split.rs @@ -1,7 +1,7 @@ /// Splits multiline comments into lines and prefixes them with `#`. 
pub fn format_comment(txt: &str) -> String { txt.lines() - .map(|txt| txt.trim()) + .map(str::trim) .filter(|txt| !txt.is_empty()) .map(|txt| { if txt.starts_with('#') { diff --git a/robotxt/lib.rs b/robotxt/lib.rs index f73fd97..9378f18 100644 --- a/robotxt/lib.rs +++ b/robotxt/lib.rs @@ -8,8 +8,8 @@ pub use url; #[cfg(feature = "builder")] pub use build::{GroupBuilder, RobotsBuilder}; #[cfg(feature = "parser")] -pub use parse::{AccessResult, Robots, ALL_UAS}; -pub use paths::{create_url, BYTE_LIMIT}; +pub use parse::{AccessResult, ALL_UAS, Robots}; +pub use paths::{BYTE_LIMIT, create_url}; /// Unrecoverable failure during `robots.txt` building or parsing. /// @@ -29,6 +29,7 @@ pub enum Error { /// Unable to create the expected path to the `robots.txt` file: /// unexpected parsing error. + // TODO: Remove url::ParseError. #[error("url parsing error: {0}")] Url(#[from] url::ParseError), } @@ -37,7 +38,7 @@ pub enum Error { /// /// [`Result`]: std::result::Result /// [`robotxt`]: crate -pub type Result = std::result::Result; +pub type Result = std::result::Result; mod paths; @@ -51,10 +52,10 @@ mod parse; #[doc(hidden)] pub mod prelude { + pub use super::{Error, Result}; #[cfg(feature = "builder")] - pub use super::build::*; + pub use super::build::{GroupBuilder, RobotsBuilder}; #[cfg(feature = "parser")] - pub use super::parse::*; - pub use super::paths::*; - pub use super::{Error, Result}; + pub use super::parse::{AccessResult, ALL_UAS, Robots}; + pub use super::paths::{BYTE_LIMIT, create_url}; } diff --git a/robotxt/parse/access.rs b/robotxt/parse/access.rs index d95e999..acd64f8 100644 --- a/robotxt/parse/access.rs +++ b/robotxt/parse/access.rs @@ -1,3 +1,5 @@ +use std::fmt; + /// The result of the `robots.txt` retrieval attempt. /// /// See [`Robots::from_access`]. @@ -52,7 +54,8 @@ pub enum AccessResult<'a> { impl AccessResult<'_> { /// Returns the textual representation of a status. 
- pub fn as_str(&self) -> &'static str { + #[must_use] + pub const fn as_str(&self) -> &'static str { match self { AccessResult::Successful(_) => "Successful", AccessResult::Redirect => "Redirect", @@ -62,8 +65,8 @@ impl AccessResult<'_> { } } -impl std::fmt::Display for AccessResult<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl fmt::Display for AccessResult<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.as_str()) } } diff --git a/robotxt/parse/inner.rs b/robotxt/parse/inner.rs index 6bbf2ae..dc26205 100644 --- a/robotxt/parse/inner.rs +++ b/robotxt/parse/inner.rs @@ -11,7 +11,7 @@ use crate::parse::rule::Rule; use crate::paths::normalize_path; use crate::BYTE_LIMIT; -/// The [`Rules`] enum determines if the [RobotsInner::is_allowed] results +/// The [`Rules`] enum determines if the [`RobotsInner::is_allowed`] results /// from the set of [`Rule`]s or the single provided global rule. #[derive(Debug, Clone, PartialEq, Eq)] pub(crate) enum Rules { @@ -20,7 +20,8 @@ pub(crate) enum Rules { } /// The [`RobotsInner`] struct provides convenient and efficient storage for -/// the data associated with certain user-agent for further matching. +/// the data associated with a specific `user-agent` for further matching. +#[must_use] #[derive(Debug, Clone, PartialEq, Eq)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct RobotsInner { @@ -63,11 +64,11 @@ impl RobotsInner { // TODO: Remove overlapping rules. #[cfg(feature = "optimal")] - if rules.is_empty() || rules.iter().all(|r| r.is_allowed()) { + if rules.is_empty() || rules.iter().all(Rule::is_allowed) { // Empty or all allow. return Rules::Always(true); } else if rules.iter().all(|r| !r.is_allowed()) - && rules.iter().rev().any(|r| r.is_universal()) + && rules.iter().rev().any(Rule::is_universal) { // All disallow + universal disallow. // Universal rule should be one of the smallest, so reverse the iter. 
@@ -89,6 +90,7 @@ impl RobotsInner { /// Returns `Some(true)` if there is an explicit `allow` or the global rule. /// NOTE: Expects relative path. + #[must_use] pub fn try_is_allowed(&self, path: &str) -> Option { match self.rules { Rules::Always(always) => Some(always), @@ -104,13 +106,15 @@ impl RobotsInner { /// Returns true if the relative path is allowed for this set of rules. /// NOTE: Expects relative path. + #[must_use] pub fn is_allowed(&self, path: &str) -> bool { // Returns true is there is no rule matching the path. self.try_is_allowed(path).unwrap_or(true) } /// Returns `Some(_)` if the rules fully allow or disallow. - pub fn is_always(&self) -> Option { + #[must_use] + pub const fn is_always(&self) -> Option { match &self.rules { Rules::Rules(_) => None, Rules::Always(always) => Some(*always), @@ -118,22 +122,26 @@ impl RobotsInner { } /// Returns the longest matching user-agent. + #[must_use] pub fn user_agent(&self) -> &str { self.user_agent.as_ref() } /// Returns the specified crawl-delay. - pub fn crawl_delay(&self) -> Option { + #[must_use] + pub const fn crawl_delay(&self) -> Option { self.crawl_delay } /// Returns all collected sitemaps. + #[must_use] pub fn sitemaps(&self) -> &[Url] { self.sitemaps.as_slice() } /// Returns the total amount of applied rules unless constructed /// with (or optimized to) the global rule. + #[must_use] pub fn len(&self) -> Option { match &self.rules { Rules::Rules(vec) => Some(vec.len()), @@ -143,6 +151,7 @@ impl RobotsInner { /// Returns true if there are no applied rules i.e. it is constructed /// with (or optimized to) the global rule. 
+ #[must_use] pub fn is_empty(&self) -> Option { self.len().map(|len| len == 0) } @@ -151,9 +160,10 @@ impl RobotsInner { #[cfg(test)] #[cfg(feature = "optimal")] mod optimal_output { - use super::*; use crate::ALL_UAS; + use super::*; + #[test] fn from() { let r = RobotsInner::from_always(true, None, "foo"); @@ -192,9 +202,10 @@ mod optimal_output { #[cfg(test)] mod precedence_rules { - use super::*; use crate::ALL_UAS; + use super::*; + #[test] fn simple() { let t = b"Allow: /p \n Disallow: /"; diff --git a/robotxt/parse/lexer.rs b/robotxt/parse/lexer.rs index 7cf8e03..f8393e8 100644 --- a/robotxt/parse/lexer.rs +++ b/robotxt/parse/lexer.rs @@ -42,13 +42,13 @@ const NEWLINE: u8 = b'\n'; const COMMENT: u8 = b'#'; /// Returns true if the character code is neither a newline nor a carriage return. -fn not_line_ending(c: u8) -> bool { +const fn not_line_ending(c: u8) -> bool { c != NEWLINE && c != CARRIAGE } /// Returns true if the character code is neither a newline, a carriage return, /// nor a comment character. 
-fn not_line_ending_or_comment(c: u8) -> bool { +const fn not_line_ending_or_comment(c: u8) -> bool { c != NEWLINE && c != CARRIAGE && c != COMMENT } diff --git a/robotxt/parse/mod.rs b/robotxt/parse/mod.rs index 1124186..013339d 100644 --- a/robotxt/parse/mod.rs +++ b/robotxt/parse/mod.rs @@ -1,20 +1,22 @@ -use std::io::{BufReader, Read}; +use std::io::{self, BufReader, Read}; use std::sync::Arc; +use std::time::Duration; +#[cfg(feature = "serde")] +use ::serde::{Deserialize, Serialize}; use url::Url; -use crate::BYTE_LIMIT; pub use access::AccessResult; use inner::RobotsInner; +use crate::BYTE_LIMIT; + mod access; mod inner; mod lexer; mod parser; mod rule; -#[cfg(feature = "serde")] -use ::serde::{Deserialize, Serialize}; #[cfg(feature = "serde")] mod serde; @@ -62,6 +64,7 @@ pub const ALL_UAS: &str = "*"; /// assert!(!r.is_relative_allowed("/example/nope.txt")); /// assert!(!r.is_relative_allowed("/invalid/path.txt")); /// ``` +#[must_use] #[derive(Debug, Clone, PartialEq, Eq)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct Robots { @@ -87,6 +90,7 @@ impl Robots { /// assert!(!r.is_relative_allowed("/example/nope.txt")); /// assert!(!r.is_relative_allowed("/invalid/path.txt")); /// ``` + pub fn from_bytes(robots: &[u8], user_agent: &str) -> Self { let inner = RobotsInner::from_bytes(robots, user_agent); Self { @@ -113,7 +117,7 @@ impl Robots { /// assert!(!r.is_relative_allowed("/example/nope.txt")); /// assert!(!r.is_relative_allowed("/invalid/path.txt")); /// ``` - pub fn from_reader(reader: R, user_agent: &str) -> Result { + pub fn from_reader(reader: R, user_agent: &str) -> io::Result { let reader = reader.take(BYTE_LIMIT as u64); let mut reader = BufReader::new(reader); @@ -170,6 +174,7 @@ impl Robots { /// See [`RobotsBuilder::new`]. 
/// /// [`RobotsBuilder::new`]: crate::RobotsBuilder::new + #[inline] #[cfg(feature = "builder")] #[cfg_attr(docsrs, doc(cfg(feature = "builder")))] pub fn builder() -> crate::RobotsBuilder { @@ -195,6 +200,8 @@ impl Robots { /// assert_eq!(r.try_is_relative_allowed("/example/nope.txt"), Some(false)); /// assert_eq!(r.try_is_relative_allowed("/invalid/path.txt"), None); /// ``` + #[inline] + #[must_use] pub fn try_is_relative_allowed(&self, addr: &str) -> Option { self.inner.try_is_allowed(addr) } @@ -217,6 +224,8 @@ impl Robots { /// assert!(!r.is_relative_allowed("/example/nope.txt")); /// assert!(!r.is_relative_allowed("/invalid/path.txt")); /// ``` + #[inline] + #[must_use] pub fn is_relative_allowed(&self, addr: &str) -> bool { self.inner.is_allowed(addr) } @@ -240,18 +249,16 @@ impl Robots { /// assert_eq!(r.try_is_absolute_allowed(&base.join("/example/nope.txt").unwrap()), Some(false)); /// assert_eq!(r.try_is_absolute_allowed(&base.join("/invalid/path.txt").unwrap()), None); /// ``` + #[must_use] pub fn try_is_absolute_allowed(&self, addr: &Url) -> Option { let path = addr.path().to_owned(); - let query = addr - .query() - .map(|u| "?".to_owned() + u) - .unwrap_or("".to_owned()); + let query = addr.query().map(|u| "?".to_owned() + u).unwrap_or_default(); let frag = addr .fragment() .map(|u| "#".to_owned() + u) - .unwrap_or("".to_owned()); + .unwrap_or_default(); let relative = path + &query + &frag; self.inner.try_is_allowed(&relative) @@ -277,6 +284,7 @@ impl Robots { /// assert!(!r.is_absolute_allowed(&base.join("/example/nope.txt").unwrap())); /// assert!(!r.is_absolute_allowed(&base.join("/invalid/path.txt").unwrap())); /// ``` + #[must_use] pub fn is_absolute_allowed(&self, addr: &Url) -> bool { self.try_is_absolute_allowed(addr).unwrap_or(true) } @@ -292,6 +300,8 @@ impl Robots { /// let r = Robots::from_always(false, "foobot"); /// assert_eq!(r.is_always(), Some(false)); /// ``` + #[inline] + #[must_use] pub fn is_always(&self) -> Option { 
self.inner.is_always() } @@ -310,6 +320,8 @@ impl Robots { /// let r = Robots::from_bytes(txt, "foobot-search"); /// assert_eq!(r.user_agent(), "foobot"); /// ``` + #[inline] + #[must_use] pub fn user_agent(&self) -> &str { self.inner.user_agent() } @@ -328,7 +340,9 @@ impl Robots { /// let r = Robots::from_bytes(txt, "foobot"); /// assert_eq!(r.crawl_delay(), Some(Duration::from_secs(5))); /// ``` - pub fn crawl_delay(&self) -> Option { + #[inline] + #[must_use] + pub fn crawl_delay(&self) -> Option { self.inner.crawl_delay() } @@ -345,18 +359,24 @@ impl Robots { /// let r = Robots::from_bytes(txt, "foobot"); /// assert_eq!(r.sitemaps().len(), 2); /// ``` + #[inline] + #[must_use] pub fn sitemaps(&self) -> &[Url] { self.inner.sitemaps() } /// Returns the total amount of applied rules unless constructed /// with (or optimized to) the global rule. + #[inline] + #[must_use] pub fn len(&self) -> Option { self.inner.len() } /// Returns true if there are no applied rules i.e. it is constructed /// with (or optimized to) the global rule. + #[inline] + #[must_use] pub fn is_empty(&self) -> Option { self.inner.is_empty() } diff --git a/robotxt/parse/parser.rs b/robotxt/parse/parser.rs index 4e18e2b..d8e56ab 100644 --- a/robotxt/parse/parser.rs +++ b/robotxt/parse/parser.rs @@ -22,8 +22,8 @@ impl Parser { pub fn parse_rules(directives: &[Directive], user_agent: &str) -> Self { let (longest_match, captures_rules) = Self::longest_match(directives, user_agent); let mut state = Self { - longest_match, captures_rules, + longest_match, ..Self::default() }; @@ -60,7 +60,7 @@ impl Parser { // Finds the longest `User-Agent` in the acceptable pool. let selected_ua = acceptable_uas .max_by(|lhs, rhs| lhs.len().cmp(&rhs.len())) - .unwrap_or(ALL_UAS.to_string()); + .unwrap_or_else(|| ALL_UAS.to_string()); // Determines if it should check non-assigned rules. 
let check_non_assigned = selected_ua == ALL_UAS; diff --git a/robotxt/parse/rule.rs b/robotxt/parse/rule.rs index 176ecb1..1432a8f 100644 --- a/robotxt/parse/rule.rs +++ b/robotxt/parse/rule.rs @@ -176,14 +176,14 @@ impl Rule { /// Returns true if the path matches the pattern. /// NOTE: Expects normalized relative path. pub fn is_match(&self, path: &str) -> bool { - match &self.wildcard { - None => path.starts_with(self.pattern.as_str()), - Some(wildcard) => wildcard.is_match(path), - } + self.wildcard.as_ref().map_or_else( + || path.starts_with(self.pattern.as_str()), + |wildcard| wildcard.is_match(path), + ) } /// Returns true if allowed. - pub fn is_allowed(&self) -> bool { + pub const fn is_allowed(&self) -> bool { self.allow } diff --git a/robotxt/parse/serde.rs b/robotxt/parse/serde.rs index 65adf3a..25734a9 100644 --- a/robotxt/parse/serde.rs +++ b/robotxt/parse/serde.rs @@ -1,17 +1,18 @@ use serde::de::{Error, MapAccess, Visitor}; use serde::ser::SerializeStruct; +use serde::{Deserialize, Serialize}; use serde::{Deserializer, Serializer}; use crate::parse::inner::Rules; use crate::parse::rule::Rule; -impl serde::Serialize for Rules { +impl Serialize for Rules { fn serialize(&self, serializer: S) -> Result where S: Serializer, { match self { - Rules::Rules(rules) => { + Self::Rules(rules) => { let (allow, disallow): (Vec<_>, Vec<_>) = rules.iter().partition(|u| u.is_allowed()); let allow: Vec<_> = allow.iter().map(|u| u.pattern().to_string()).collect(); @@ -22,7 +23,7 @@ impl serde::Serialize for Rules { s.serialize_field("disallow", &disallow)?; s.end() } - Rules::Always(always) => { + Self::Always(always) => { let mut s = serializer.serialize_struct("AlwaysRules", 1)?; s.serialize_field("always", always)?; s.end() @@ -31,7 +32,7 @@ impl serde::Serialize for Rules { } } -impl<'de> serde::Deserialize<'de> for Rules { +impl<'de> Deserialize<'de> for Rules { fn deserialize(deserializer: D) -> Result where D: Deserializer<'de>, diff --git 
a/robotxt/paths/normal.rs b/robotxt/paths/normal.rs index 2a30950..55604c9 100644 --- a/robotxt/paths/normal.rs +++ b/robotxt/paths/normal.rs @@ -12,9 +12,9 @@ pub(crate) fn normalize_path(path: &str) -> String { // Url::make_relative strips leading and trailing / // https://github.com/servo/rust-url/issues/772 // https://github.com/servo/rust-url/issues/766 - if !path.starts_with('/') { - '/'.to_string() + &path - } else { + if path.starts_with('/') { path + } else { + '/'.to_string() + &path } } From 778597dbd0277481d783bd0973ccc6862891b183 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Tue, 16 Apr 2024 22:51:04 +0200 Subject: [PATCH 07/11] fix(sitemapo): rem auto, update --- Cargo.lock | 94 +++++++++++++++++++------------------- Cargo.toml | 10 ++-- robotxt/lib.rs | 12 ++--- sitemapo/Cargo.toml | 4 +- sitemapo/build/auto.rs | 101 ----------------------------------------- sitemapo/build/mod.rs | 12 ++--- 6 files changed, 65 insertions(+), 168 deletions(-) delete mode 100644 sitemapo/build/auto.rs diff --git a/Cargo.lock b/Cargo.lock index 8d28a44..c34da71 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -19,18 +19,18 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "aho-corasick" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] [[package]] name = "async-trait" -version = "0.1.77" +version = "0.1.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" +checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" dependencies = [ "proc-macro2", "quote", @@ -39,15 +39,15 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.1.0" +version = "1.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" [[package]] name = "backtrace" -version = "0.3.69" +version = "0.3.71" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +checksum = "26b05800d2e817c8b3b4b54abd461726265fa9789ae34330622f2db9ee696f9d" dependencies = [ "addr2line", "cc", @@ -71,15 +71,15 @@ dependencies = [ [[package]] name = "bytes" -version = "1.5.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" [[package]] name = "cc" -version = "1.0.90" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" +checksum = "17f6e324229dc011159fcc089755d1e2e216a90d43a7dea6853ca740b84f35e7" [[package]] name = "cfg-if" @@ -89,20 +89,20 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "countio" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "deb137baf1ce7c00453689055b2a4311ed34d288cb2e66e892e3142e30dae67f" +version = "0.2.17" dependencies = [ + "futures-io", + "futures-test", + "futures-util", "tokio", ] [[package]] name = "countio" version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38f3ee5288f31b7a8c248759df862af9900e5e14513e4e77d003b49ce8e54605" dependencies = [ - "futures-io", - "futures-test", - "futures-util", "tokio", ] @@ -229,9 +229,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.10" +version = "1.0.11" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "libc" @@ -241,9 +241,9 @@ checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "memchr" -version = "2.7.1" +version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" [[package]] name = "minimal-lexical" @@ -331,9 +331,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.13" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" [[package]] name = "pin-utils" @@ -349,9 +349,9 @@ checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" dependencies = [ "unicode-ident", ] @@ -368,18 +368,18 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] [[package]] name = "regex" -version = "1.10.3" +version = "1.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" +checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" dependencies = [ "aho-corasick", "memchr", @@ -400,9 +400,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" [[package]] name = "robotxt" @@ -452,9 +452,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.114" +version = "1.0.115" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0" +checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" dependencies = [ "itoa", "ryu", @@ -473,7 +473,7 @@ version = "0.2.0" dependencies = [ "async-trait", "bytes", - "countio 0.2.15", + "countio 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", "isolang", "quick-xml", "thiserror", @@ -493,9 +493,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.52" +version = "2.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" +checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687" dependencies = [ "proc-macro2", "quote", @@ -504,18 +504,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.57" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" +checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.57" +version = "1.0.58" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" +checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" dependencies = [ "proc-macro2", "quote", @@ -524,9 +524,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.34" +version = "0.3.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749" +checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" dependencies = [ "deranged", "itoa", @@ -545,9 +545,9 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.17" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774" +checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" dependencies = [ "num-conv", "time-core", @@ -570,9 +570,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.36.0" +version = "1.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" dependencies = [ "backtrace", "bytes", diff --git a/Cargo.toml b/Cargo.toml index 8dc43e9..52cfcc9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,15 +14,15 @@ authors = ["Oleh Martsokha "] license = "MIT" [workspace.dependencies] -tokio = { version = "1.36.0", default-features = false } +tokio = { version = "1.37.0", default-features = false } futures-io = { version = "0.3.30", default-features = false } futures-util = { version = "0.3.30", default-features = false } futures-test = { version = "0.3.30", default-features = false } url = { version = 
"2.5.0" } -async-trait = { version = "0.1.77" } -thiserror = { version = "1.0.57" } +async-trait = { version = "0.1.80" } +thiserror = { version = "1.0.58" } serde = { version = "1.0.197" } -serde_json = { version = "1.0.114" } -time = { version = "0.3.34", default-features = false } +serde_json = { version = "1.0.115" } +time = { version = "0.3.36", default-features = false } diff --git a/robotxt/lib.rs b/robotxt/lib.rs index 9378f18..5ad475c 100644 --- a/robotxt/lib.rs +++ b/robotxt/lib.rs @@ -8,8 +8,8 @@ pub use url; #[cfg(feature = "builder")] pub use build::{GroupBuilder, RobotsBuilder}; #[cfg(feature = "parser")] -pub use parse::{AccessResult, ALL_UAS, Robots}; -pub use paths::{BYTE_LIMIT, create_url}; +pub use parse::{AccessResult, Robots, ALL_UAS}; +pub use paths::{create_url, BYTE_LIMIT}; /// Unrecoverable failure during `robots.txt` building or parsing. /// @@ -52,10 +52,10 @@ mod parse; #[doc(hidden)] pub mod prelude { - pub use super::{Error, Result}; #[cfg(feature = "builder")] - pub use super::build::{GroupBuilder, RobotsBuilder}; + pub use super::build::{GroupBuilder, RobotsBuilder}; #[cfg(feature = "parser")] - pub use super::parse::{AccessResult, ALL_UAS, Robots}; - pub use super::paths::{BYTE_LIMIT, create_url}; + pub use super::parse::{AccessResult, Robots, ALL_UAS}; + pub use super::paths::{create_url, BYTE_LIMIT}; + pub use super::{Error, Result}; } diff --git a/sitemapo/Cargo.toml b/sitemapo/Cargo.toml index 1722c07..c5d18f2 100644 --- a/sitemapo/Cargo.toml +++ b/sitemapo/Cargo.toml @@ -42,10 +42,10 @@ extension = ["dep:isolang"] [dependencies] url = { workspace = true } thiserror = { workspace = true } -countio = { version = "0.2.15" } +countio = { version = "0.2.17" } quick-xml = { version = "0.31.0" } -bytes = { version = "1.5.0", features = [] } +bytes = { version = "1.6.0", features = [] } time = { workspace = true, features = ["parsing", "formatting"] } tokio = { workspace = true, optional = true } diff --git a/sitemapo/build/auto.rs 
b/sitemapo/build/auto.rs deleted file mode 100644 index c85dce1..0000000 --- a/sitemapo/build/auto.rs +++ /dev/null @@ -1,101 +0,0 @@ -use url::Url; - -use crate::build::{EntryBuilder, IndexBuilder}; -use crate::record::Entry; -use crate::Error; - -/// TODO: Desc. -/// -/// Automatic sitemap file constructor. -/// NOTE: Does not deduplicate records. -/// -/// ```rust -/// #[derive(Debug, thiserror::Error)] -/// enum CustomError { -/// // .. -/// #[error("sitemap error: {0}")] -/// Sitemap(#[from] sitemapo::Error), -/// //.. -/// } -/// -/// fn main() -> Result<(), CustomError> { -/// Ok(()) -/// } -/// ``` -pub struct AutoBuilder { - index: Option>, - entry: Vec>, - queue: Vec, - // factory: impl Fn() -> W, -} - -impl AutoBuilder { - /// TODO: Desc. - pub fn new() -> Self { - todo!() - } -} - -impl AutoBuilder -where - W: std::io::Write, -{ - /// TODO: Desc. - pub fn try_sync(&mut self, fetcher: A) -> Result<(), E> - where - E: std::error::Error + From, - A: Fn(Url) -> Result, E>, - { - // if let Some(builder) = self.entry.as_mut() { - // builder.write(record) - // } - - todo!() - } -} - -#[cfg(feature = "tokio")] -#[cfg_attr(docsrs, doc(cfg(feature = "tokio")))] -impl AutoBuilder -where - W: tokio::io::AsyncWrite + Unpin + Send, -{ - /// TODO: Desc. - pub async fn try_async(&mut self) -> Result<(), Error> { - todo!() - } -} - -impl std::fmt::Debug for AutoBuilder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // TODO: Debug. - f.debug_struct("AutoBuilder").finish() - } -} - -// impl Default for AutoBuilder { -// fn default() -> Self { -// Self { -// entry: None, -// index: None, -// } -// } -// } - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn sync() -> Result<(), Error> { - // TODO: Test. - Ok(()) - } - - #[cfg(feature = "tokio")] - #[tokio::test] - async fn asynk() -> Result<(), Error> { - // TODO: Test. 
- Ok(()) - } -} diff --git a/sitemapo/build/mod.rs b/sitemapo/build/mod.rs index 9a60e67..7c269bb 100644 --- a/sitemapo/build/mod.rs +++ b/sitemapo/build/mod.rs @@ -1,15 +1,13 @@ -mod auto; -mod entry; -mod index; -mod inner; -mod plain; - -pub use auto::*; pub use entry::*; pub use index::*; pub(crate) use inner::*; pub use plain::*; +mod entry; +mod index; +mod inner; +mod plain; + // TODO: Make builders take BufWrite. /// Core trait for the builder implementation. From 4474a319f2e2ccae37503a875dcd794ad454be3d Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 12 May 2024 03:29:07 +0200 Subject: [PATCH 08/11] feat(all): pedantic, rem lock --- .github/workflows/build.yaml | 16 +- .github/workflows/publish.yaml | 7 +- .gitignore | 2 + Cargo.lock | 625 --------------------------------- Cargo.toml | 6 +- countio/README.md | 3 +- countio/counter/futures.rs | 2 +- countio/counter/stdlib.rs | 2 + countio/counter/tokio.rs | 2 +- robotxt/Cargo.toml | 2 +- robotxt/README.md | 3 +- robotxt/build/group.rs | 9 +- robotxt/lib.rs | 13 +- robotxt/parse/inner.rs | 4 +- robotxt/paths/mod.rs | 2 +- robotxt/paths/normal.rs | 2 +- sitemapo/Cargo.toml | 2 +- sitemapo/README.md | 3 +- sitemapo/lib.rs | 8 +- sitemapo/record/index.rs | 2 +- 20 files changed, 40 insertions(+), 675 deletions(-) delete mode 100644 Cargo.lock diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 54ee07d..1b82b26 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -1,11 +1,11 @@ +name: Build + on: push: branches: - main pull_request: -name: Build - jobs: ci: name: CI @@ -25,17 +25,7 @@ jobs: toolchain: ${{ matrix.toolchain }} - name: Set up Cache - uses: actions/cache@v4 - with: - path: | - ~/.cargo/bin/ - ~/.cargo/registry/index/ - ~/.cargo/registry/cache/ - ~/.cargo/git/db/ - ~/.cargo/.crates.toml - ~/.cargo/.crates2.json - key: ${{ runner.os }}-${{ matrix.toolchain }}-cargo-${{ hashFiles('**/Cargo.lock') }} - restore-keys: ${{ runner.os }}-${{ 
matrix.toolchain }}-cargo- + uses: Swatinem/rust-cache@v2 - name: Install Tarpaulin if: matrix.os == 'ubuntu-latest' diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 3062f4e..08d25dd 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -1,11 +1,8 @@ +name: Publish + on: - push: - branches: - - main workflow_dispatch: -name: Publish - jobs: cd: name: CD diff --git a/.gitignore b/.gitignore index bb27ea8..b4e5423 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # OS Thumbs.db .DS_Store +*.pdb # Editors .vs/ @@ -11,6 +12,7 @@ Thumbs.db # Lang: Rust debug/ target/ +Cargo.lock **/*.rs.bk # Environment diff --git a/Cargo.lock b/Cargo.lock deleted file mode 100644 index c34da71..0000000 --- a/Cargo.lock +++ /dev/null @@ -1,625 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. -version = 3 - -[[package]] -name = "addr2line" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" -dependencies = [ - "gimli", -] - -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - -[[package]] -name = "aho-corasick" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" -dependencies = [ - "memchr", -] - -[[package]] -name = "async-trait" -version = "0.1.80" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "autocfg" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" - -[[package]] -name = "backtrace" -version = "0.3.71" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b05800d2e817c8b3b4b54abd461726265fa9789ae34330622f2db9ee696f9d" -dependencies = [ - "addr2line", - "cc", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", -] - -[[package]] -name = "bstr" -version = "1.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05efc5cfd9110c8416e471df0e96702d58690178e206e61b7173706673c93706" -dependencies = [ - "memchr", - "regex-automata", - "serde", -] - -[[package]] -name = "bytes" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" - -[[package]] -name = "cc" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f6e324229dc011159fcc089755d1e2e216a90d43a7dea6853ca740b84f35e7" - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "countio" -version = "0.2.17" -dependencies = [ - "futures-io", - "futures-test", - "futures-util", - "tokio", -] - -[[package]] -name = "countio" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f3ee5288f31b7a8c248759df862af9900e5e14513e4e77d003b49ce8e54605" -dependencies = [ - "tokio", -] - -[[package]] -name = "deranged" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" -dependencies = [ - "powerfmt", -] - -[[package]] -name = "form_urlencoded" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" -dependencies = [ - "percent-encoding", -] - -[[package]] -name = "futures-core" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" - -[[package]] -name = "futures-executor" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-io" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" - -[[package]] -name = "futures-macro" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "futures-sink" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" - -[[package]] -name = "futures-task" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" - -[[package]] -name = "futures-test" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce388237b32ac42eca0df1ba55ed3bbda4eaf005d7d4b5dbc0b20ab962928ac9" -dependencies = [ - "futures-core", - "futures-executor", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "futures-util", - "pin-project", - "pin-utils", -] - -[[package]] -name = "futures-util" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" -dependencies = [ - "futures-core", - "futures-io", - "futures-task", - "memchr", - "pin-project-lite", - "pin-utils", - "slab", -] - -[[package]] -name = "gimli" -version = "0.28.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" - -[[package]] -name = "idna" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" -dependencies = [ - "unicode-bidi", - "unicode-normalization", -] - -[[package]] -name = "isolang" -version = "2.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe50d48c77760c55188549098b9a7f6e37ae980c586a24693d6b01c3b2010c3c" -dependencies = [ - "phf", -] - -[[package]] -name = "itoa" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" - -[[package]] -name = "libc" -version = "0.2.153" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" - -[[package]] -name = "memchr" -version = "2.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" - -[[package]] -name = "minimal-lexical" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" - -[[package]] -name = "miniz_oxide" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" -dependencies = [ - "adler", -] - -[[package]] -name = "nom" -version = "7.1.3" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" -dependencies = [ - "memchr", - "minimal-lexical", -] - -[[package]] -name = "num-conv" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" - -[[package]] -name = "object" -version = "0.32.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" -dependencies = [ - "memchr", -] - -[[package]] -name = "percent-encoding" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" - -[[package]] -name = "phf" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" -dependencies = [ - "phf_shared", -] - -[[package]] -name = "phf_shared" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" -dependencies = [ - "siphasher", -] - -[[package]] -name = "pin-project" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "powerfmt" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" - -[[package]] -name = "proc-macro2" -version = "1.0.79" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "quick-xml" -version = "0.31.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" -dependencies = [ - "memchr", - "tokio", -] - -[[package]] -name = "quote" -version = "1.0.36" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "regex" -version = "1.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" - -[[package]] -name = "robotxt" -version = "0.6.1" 
-dependencies = [ - "bstr", - "nom", - "percent-encoding", - "regex", - "serde", - "serde_json", - "thiserror", - "url", -] - -[[package]] -name = "rustc-demangle" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" - -[[package]] -name = "ryu" -version = "1.0.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" - -[[package]] -name = "serde" -version = "1.0.197" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.197" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "serde_json" -version = "1.0.115" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" -dependencies = [ - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "siphasher" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" - -[[package]] -name = "sitemapo" -version = "0.2.0" -dependencies = [ - "async-trait", - "bytes", - "countio 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", - "isolang", - "quick-xml", - "thiserror", - "time", - "tokio", - "url", -] - -[[package]] -name = "slab" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" -dependencies = [ - "autocfg", -] - -[[package]] -name = "syn" -version = 
"2.0.58" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "thiserror" -version = "1.0.58" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.58" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "time" -version = "0.3.36" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" -dependencies = [ - "deranged", - "itoa", - "num-conv", - "powerfmt", - "serde", - "time-core", - "time-macros", -] - -[[package]] -name = "time-core" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" - -[[package]] -name = "time-macros" -version = "0.2.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" -dependencies = [ - "num-conv", - "time-core", -] - -[[package]] -name = "tinyvec" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" -dependencies = [ - "tinyvec_macros", -] - -[[package]] -name = "tinyvec_macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" - -[[package]] -name = "tokio" -version = "1.37.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" -dependencies = [ - "backtrace", - "bytes", - "pin-project-lite", - "tokio-macros", -] - -[[package]] -name = "tokio-macros" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "unicode-bidi" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" - -[[package]] -name = "unicode-ident" -version = "1.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" - -[[package]] -name = "unicode-normalization" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" -dependencies = [ - "tinyvec", -] - -[[package]] -name = "url" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" -dependencies = [ - "form_urlencoded", - "idna", - "percent-encoding", - "serde", -] diff --git a/Cargo.toml b/Cargo.toml index 52cfcc9..2cb6861 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,8 +21,8 @@ futures-test = { version = "0.3.30", default-features = false } url = { version = "2.5.0" } async-trait = { version = "0.1.80" } -thiserror = { version = "1.0.58" } +thiserror = { version = "1.0.60" } -serde = { version = "1.0.197" } -serde_json = { version = "1.0.115" } +serde = { version = "1.0.201" } +serde_json = { version = "1.0.117" } time = { version = "0.3.36", default-features = false } diff --git a/countio/README.md b/countio/README.md index 
1255629..7b56a96 100644 --- a/countio/README.md +++ b/countio/README.md @@ -5,8 +5,7 @@ [![Crate Version][crates-badge]][crates-url] [![Crate Coverage][coverage-badge]][coverage-url] -**Also check out other `spire-rs` projects -[here](https://github.com/spire-rs).** +**Check out other `spire` projects [here](https://github.com/spire-rs).** [action-badge]: https://img.shields.io/github/actions/workflow/status/spire-rs/kit/build.yaml?branch=main&label=build&logo=github&style=flat-square [action-url]: https://github.com/spire-rs/kit/actions/workflows/build.yaml diff --git a/countio/counter/futures.rs b/countio/counter/futures.rs index 64b0f8a..545bd3e 100644 --- a/countio/counter/futures.rs +++ b/countio/counter/futures.rs @@ -48,7 +48,7 @@ impl AsyncWrite for Counter { let poll = pin.poll_write(ctx, buf); if let Poll::Ready(Ok(bytes)) = poll { - counter.writer_bytes += bytes + counter.writer_bytes += bytes; } poll diff --git a/countio/counter/stdlib.rs b/countio/counter/stdlib.rs index adfc8e3..edcf7a4 100644 --- a/countio/counter/stdlib.rs +++ b/countio/counter/stdlib.rs @@ -30,12 +30,14 @@ impl Write for Counter { Ok(bytes) } + #[inline] fn flush(&mut self) -> Result<()> { self.inner.flush() } } impl Seek for Counter { + #[inline] fn seek(&mut self, pos: SeekFrom) -> Result { self.inner.seek(pos) } diff --git a/countio/counter/tokio.rs b/countio/counter/tokio.rs index 6e97336..2b7ddc0 100644 --- a/countio/counter/tokio.rs +++ b/countio/counter/tokio.rs @@ -53,7 +53,7 @@ impl AsyncWrite for Counter { let poll = pin.poll_write(ctx, buf); if let Poll::Ready(Ok(bytes)) = poll { - counter.writer_bytes += bytes + counter.writer_bytes += bytes; } poll diff --git a/robotxt/Cargo.toml b/robotxt/Cargo.toml index 6e09410..dc043b0 100644 --- a/robotxt/Cargo.toml +++ b/robotxt/Cargo.toml @@ -2,7 +2,7 @@ [package] name = "robotxt" -version = "0.6.1" +version = "0.6.2" readme = "./README.md" edition = { workspace = true } diff --git a/robotxt/README.md b/robotxt/README.md index 
ff24361..f5ff32b 100644 --- a/robotxt/README.md +++ b/robotxt/README.md @@ -5,8 +5,7 @@ [![Crate Version][crates-badge]][crates-url] [![Crate Coverage][coverage-badge]][coverage-url] -**Also check out other `spire-rs` projects -[here](https://github.com/spire-rs).** +**Check out other `spire` projects [here](https://github.com/spire-rs).** [action-badge]: https://img.shields.io/github/actions/workflow/status/spire-rs/kit/build.yaml?branch=main&label=build&logo=github&style=flat-square [action-url]: https://github.com/spire-rs/kit/actions/workflows/build.yaml diff --git a/robotxt/build/group.rs b/robotxt/build/group.rs index 220c704..737a1f7 100644 --- a/robotxt/build/group.rs +++ b/robotxt/build/group.rs @@ -144,9 +144,10 @@ impl Display for GroupBuilder { // Explicit Allow: * if no Disallows. // Used to interrupt the user-group i.e. // user-agent: a ..no rules.. user-agent: b - match self.rules_disallow.is_empty() { - true => Some("Allow: *".to_string()), - false => None, + if self.rules_disallow.is_empty() { + Some("Allow: *".to_string()) + } else { + None } } else { let rd = self.rules_allow.iter(); @@ -155,7 +156,7 @@ impl Display for GroupBuilder { }; let result = [header, agents, delay, disallows, allows, footer]; - let result = result.iter().filter_map(|u| u.clone()); + let result = result.iter().filter_map(Clone::clone); let result = result.collect::>().join("\n"); write!(f, "{}", result.as_str()) } diff --git a/robotxt/lib.rs b/robotxt/lib.rs index 5ad475c..0d5278d 100644 --- a/robotxt/lib.rs +++ b/robotxt/lib.rs @@ -1,6 +1,7 @@ #![forbid(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("./README.md")] +#![warn(clippy::all, clippy::pedantic, clippy::nursery, clippy::cargo)] // Re-exports pub use url; @@ -8,8 +9,8 @@ pub use url; #[cfg(feature = "builder")] pub use build::{GroupBuilder, RobotsBuilder}; #[cfg(feature = "parser")] -pub use parse::{AccessResult, Robots, ALL_UAS}; -pub use paths::{create_url, BYTE_LIMIT}; +pub use 
parse::{AccessResult, ALL_UAS, Robots}; +pub use paths::{BYTE_LIMIT, create_url}; /// Unrecoverable failure during `robots.txt` building or parsing. /// @@ -52,10 +53,10 @@ mod parse; #[doc(hidden)] pub mod prelude { + pub use super::{Error, Result}; #[cfg(feature = "builder")] - pub use super::build::{GroupBuilder, RobotsBuilder}; + pub use super::build::{GroupBuilder, RobotsBuilder}; #[cfg(feature = "parser")] - pub use super::parse::{AccessResult, Robots, ALL_UAS}; - pub use super::paths::{create_url, BYTE_LIMIT}; - pub use super::{Error, Result}; + pub use super::parse::{AccessResult, ALL_UAS, Robots}; + pub use super::paths::{BYTE_LIMIT, create_url}; } diff --git a/robotxt/parse/inner.rs b/robotxt/parse/inner.rs index dc26205..9bf65af 100644 --- a/robotxt/parse/inner.rs +++ b/robotxt/parse/inner.rs @@ -14,7 +14,7 @@ use crate::BYTE_LIMIT; /// The [`Rules`] enum determines if the [`RobotsInner::is_allowed`] results /// from the set of [`Rule`]s or the single provided global rule. #[derive(Debug, Clone, PartialEq, Eq)] -pub(crate) enum Rules { +pub enum Rules { Rules(Vec), Always(bool), } @@ -99,7 +99,7 @@ impl RobotsInner { path => rules .iter() .find(|r| r.is_match(path)) - .map(|rule| rule.is_allowed()), + .map(Rule::is_allowed), }, } } diff --git a/robotxt/paths/mod.rs b/robotxt/paths/mod.rs index 1dbe15b..6f48b94 100644 --- a/robotxt/paths/mod.rs +++ b/robotxt/paths/mod.rs @@ -1,5 +1,5 @@ pub use create::create_url; -pub(crate) use normal::normalize_path; +pub use normal::normalize_path; mod create; mod normal; diff --git a/robotxt/paths/normal.rs b/robotxt/paths/normal.rs index 55604c9..c4ab12d 100644 --- a/robotxt/paths/normal.rs +++ b/robotxt/paths/normal.rs @@ -4,7 +4,7 @@ use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS}; /// Returns the prefixed & percent-encoded path. /// NOTE: Expects relative path. 
-pub(crate) fn normalize_path(path: &str) -> String { +pub fn normalize_path(path: &str) -> String { static FRAGMENT: OnceLock = OnceLock::new(); let fragment = FRAGMENT.get_or_init(|| CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>')); let path = utf8_percent_encode(path, fragment).to_string(); diff --git a/sitemapo/Cargo.toml b/sitemapo/Cargo.toml index c5d18f2..49807cc 100644 --- a/sitemapo/Cargo.toml +++ b/sitemapo/Cargo.toml @@ -2,7 +2,7 @@ [package] name = "sitemapo" -version = "0.2.0" +version = "0.3.0" readme = "./README.md" edition = { workspace = true } diff --git a/sitemapo/README.md b/sitemapo/README.md index 643ae00..eaa4db6 100644 --- a/sitemapo/README.md +++ b/sitemapo/README.md @@ -5,8 +5,7 @@ [![Crate Version][crates-badge]][crates-url] [![Crate Coverage][coverage-badge]][coverage-url] -**Also check out other `spire-rs` projects -[here](https://github.com/spire-rs).** +**Check out other `spire` projects [here](https://github.com/spire-rs).** [action-badge]: https://img.shields.io/github/actions/workflow/status/spire-rs/kit/build.yaml?branch=main&label=build&logo=github&style=flat-square [action-url]: https://github.com/spire-rs/kit/actions/workflows/build.yaml diff --git a/sitemapo/lib.rs b/sitemapo/lib.rs index dc7a00b..6239cd6 100644 --- a/sitemapo/lib.rs +++ b/sitemapo/lib.rs @@ -1,6 +1,10 @@ #![forbid(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("./README.md")] +#![warn(clippy::all, clippy::pedantic, clippy::nursery, clippy::cargo)] + +// Re-exports +pub use url; /// Unrecoverable failure during `sitemap.xml` building or parsing. /// @@ -38,9 +42,6 @@ pub enum Error { /// [`sitemapo`]: crate pub type Result = std::result::Result; -// Re-exports -pub use url; - /// Builder types: `AutoBuilder`, `TxtBuilder` & `XmlBuilder`. pub mod build; /// Parser types: `AutoParser`, `TxtParser` & `XmlParser`. 
@@ -51,7 +52,6 @@ pub mod record; #[doc(hidden)] pub mod prelude { pub use super::{Error, Result}; - pub use super::build::*; pub use super::parse::*; pub use super::record::*; diff --git a/sitemapo/record/index.rs b/sitemapo/record/index.rs index bbfa216..716a9ae 100644 --- a/sitemapo/record/index.rs +++ b/sitemapo/record/index.rs @@ -37,6 +37,6 @@ impl Index { impl From for Index { fn from(location: Url) -> Self { - Index::new(location) + Self::new(location) } } From 22d2f437eb1f7ae2dbd9a214613ffe88eb11014c Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Tue, 18 Jun 2024 11:46:15 +0200 Subject: [PATCH 09/11] feat(all): upgrade/update deps --- Cargo.toml | 20 ++++++++--------- robotxt/Cargo.toml | 8 +++---- robotxt/build/group.rs | 2 +- robotxt/build/mod.rs | 2 +- robotxt/lib.rs | 12 +++++------ sitemapo/Cargo.toml | 8 +++---- sitemapo/build/entry.rs | 7 ++++-- sitemapo/build/index.rs | 6 ++++-- sitemapo/build/inner.rs | 2 +- sitemapo/build/plain.rs | 3 ++- sitemapo/lib.rs | 2 +- sitemapo/parse/auto.rs | 14 +++++++----- sitemapo/parse/entry.rs | 12 +++++------ sitemapo/parse/index.rs | 12 +++++------ sitemapo/parse/inner.rs | 8 +++---- sitemapo/parse/mod.rs | 14 ++++++------ sitemapo/record/entry.rs | 11 +++++----- sitemapo/record/frequency.rs | 42 +++++++++++++++++------------------- sitemapo/record/index.rs | 11 +++++----- sitemapo/record/priority.rs | 6 ++++-- 20 files changed, 106 insertions(+), 96 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2cb6861..ecb42d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,15 +14,15 @@ authors = ["Oleh Martsokha "] license = "MIT" [workspace.dependencies] -tokio = { version = "1.37.0", default-features = false } -futures-io = { version = "0.3.30", default-features = false } -futures-util = { version = "0.3.30", default-features = false } -futures-test = { version = "0.3.30", default-features = false } +tokio = { version = "1.38", default-features = false } +futures-io = { version = "0.3", default-features = 
false } +futures-util = { version = "0.3", default-features = false } +futures-test = { version = "0.3", default-features = false } -url = { version = "2.5.0" } -async-trait = { version = "0.1.80" } -thiserror = { version = "1.0.60" } +url = { version = "2.5" } +async-trait = { version = "0.1" } +thiserror = { version = "1.0" } -serde = { version = "1.0.201" } -serde_json = { version = "1.0.117" } -time = { version = "0.3.36", default-features = false } +serde = { version = "1.0" } +serde_json = { version = "1.0" } +time = { version = "0.3", default-features = false } diff --git a/robotxt/Cargo.toml b/robotxt/Cargo.toml index dc043b0..8338574 100644 --- a/robotxt/Cargo.toml +++ b/robotxt/Cargo.toml @@ -47,11 +47,11 @@ serde = ["dep:serde", "url/serde", "serde/derive", "serde/rc"] [dependencies] url = { workspace = true } thiserror = { workspace = true } -percent-encoding = { version = "2.3.1" } +percent-encoding = { version = "2.3" } -nom = { version = "7.1.3", optional = true } -bstr = { version = "1.9.1", optional = true } -regex = { version = "1.10.4", optional = true } +nom = { version = "7.1", optional = true } +bstr = { version = "1.9", optional = true } +regex = { version = "1.10", optional = true } serde = { workspace = true, optional = true } [dev-dependencies] diff --git a/robotxt/build/group.rs b/robotxt/build/group.rs index 737a1f7..1cdb84a 100644 --- a/robotxt/build/group.rs +++ b/robotxt/build/group.rs @@ -85,7 +85,7 @@ impl GroupBuilder { /// u.crawl_delay(5) /// }); /// ``` - pub fn crawl_delay(mut self, delay: u16) -> Self { + pub const fn crawl_delay(mut self, delay: u16) -> Self { self.delay = Some(delay); self } diff --git a/robotxt/build/mod.rs b/robotxt/build/mod.rs index faeff95..c9a8332 100644 --- a/robotxt/build/mod.rs +++ b/robotxt/build/mod.rs @@ -107,7 +107,7 @@ impl fmt::Display for RobotsBuilder { let header = self.header.as_ref().map(|h| format_comment(h)); let footer = self.footer.as_ref().map(|f| format_comment(f)); - let groups = 
self.groups.iter().map(|u| u.to_string()); + let groups = self.groups.iter().map(ToString::to_string); let groups = groups.collect::>().join("\n\n"); let result = [header, Some(groups), footer]; diff --git a/robotxt/lib.rs b/robotxt/lib.rs index 0d5278d..b03ba45 100644 --- a/robotxt/lib.rs +++ b/robotxt/lib.rs @@ -9,8 +9,8 @@ pub use url; #[cfg(feature = "builder")] pub use build::{GroupBuilder, RobotsBuilder}; #[cfg(feature = "parser")] -pub use parse::{AccessResult, ALL_UAS, Robots}; -pub use paths::{BYTE_LIMIT, create_url}; +pub use parse::{AccessResult, Robots, ALL_UAS}; +pub use paths::{create_url, BYTE_LIMIT}; /// Unrecoverable failure during `robots.txt` building or parsing. /// @@ -53,10 +53,10 @@ mod parse; #[doc(hidden)] pub mod prelude { - pub use super::{Error, Result}; #[cfg(feature = "builder")] - pub use super::build::{GroupBuilder, RobotsBuilder}; + pub use super::build::{GroupBuilder, RobotsBuilder}; #[cfg(feature = "parser")] - pub use super::parse::{AccessResult, ALL_UAS, Robots}; - pub use super::paths::{BYTE_LIMIT, create_url}; + pub use super::parse::{AccessResult, Robots, ALL_UAS}; + pub use super::paths::{create_url, BYTE_LIMIT}; + pub use super::{Error, Result}; } diff --git a/sitemapo/Cargo.toml b/sitemapo/Cargo.toml index 49807cc..702af36 100644 --- a/sitemapo/Cargo.toml +++ b/sitemapo/Cargo.toml @@ -42,15 +42,15 @@ extension = ["dep:isolang"] [dependencies] url = { workspace = true } thiserror = { workspace = true } -countio = { version = "0.2.17" } +countio = { version = "0.2" } -quick-xml = { version = "0.31.0" } -bytes = { version = "1.6.0", features = [] } +quick-xml = { version = "0.31" } +bytes = { version = "1.6", features = [] } time = { workspace = true, features = ["parsing", "formatting"] } tokio = { workspace = true, optional = true } async-trait = { workspace = true, optional = true } -isolang = { version = "2.4.0", optional = true, features = [] } +isolang = { version = "2.4", optional = true, features = [] } 
[dev-dependencies] time = { workspace = true, features = ["macros"] } diff --git a/sitemapo/build/entry.rs b/sitemapo/build/entry.rs index 46508a5..13e3fc1 100644 --- a/sitemapo/build/entry.rs +++ b/sitemapo/build/entry.rs @@ -4,7 +4,10 @@ use quick_xml::{events, Writer}; use time::format_description::well_known::Iso8601; use crate::build::{Builder, InnerBuilder, CONFIG}; -use crate::record::*; +use crate::record::{ + Entry, BYTE_LIMIT, CHANGE_FREQUENCY, LAST_MODIFIED, LOCATION, PRIORITY, RECORD_LIMIT, URL, + URL_SET, +}; use crate::{Error, Result}; /// Sitemap builder for the versatile XML file with an optional support of extensions. @@ -51,7 +54,7 @@ impl EntryBuilder { } /// Creates a new instance with the given inner parser. - pub(crate) fn from_inner(inner: InnerBuilder) -> Self { + pub(crate) const fn from_inner(inner: InnerBuilder) -> Self { Self { inner } } diff --git a/sitemapo/build/index.rs b/sitemapo/build/index.rs index 10222b1..88ab77d 100644 --- a/sitemapo/build/index.rs +++ b/sitemapo/build/index.rs @@ -4,7 +4,9 @@ use quick_xml::{events, Writer}; use time::format_description::well_known::Iso8601; use crate::build::{Builder, InnerBuilder, CONFIG}; -use crate::record::*; +use crate::record::{ + Index, BYTE_LIMIT, LAST_MODIFIED, LOCATION, RECORD_LIMIT, SITEMAP, SITEMAP_INDEX, +}; use crate::{Error, Result}; /// Sitemap index parser for the versatile XML file. @@ -51,7 +53,7 @@ impl IndexBuilder { } /// Creates a new instance with the given inner parser. 
- pub(crate) fn from_inner(inner: InnerBuilder) -> Self { + pub(crate) const fn from_inner(inner: InnerBuilder) -> Self { Self { inner } } diff --git a/sitemapo/build/inner.rs b/sitemapo/build/inner.rs index 42dea65..404b591 100644 --- a/sitemapo/build/inner.rs +++ b/sitemapo/build/inner.rs @@ -6,7 +6,7 @@ use time::format_description::well_known::iso8601; use crate::Error; -pub(crate) const CONFIG: iso8601::EncodedConfig = iso8601::Config::DEFAULT +pub const CONFIG: iso8601::EncodedConfig = iso8601::Config::DEFAULT .set_time_precision(iso8601::TimePrecision::Second { decimal_digits: NonZeroU8::new(2), }) diff --git a/sitemapo/build/plain.rs b/sitemapo/build/plain.rs index 1b82a75..eb6efd3 100644 --- a/sitemapo/build/plain.rs +++ b/sitemapo/build/plain.rs @@ -4,7 +4,7 @@ use countio::Counter; use url::Url; use crate::build::Builder; -use crate::record::*; +use crate::record::{BYTE_LIMIT, RECORD_LIMIT}; use crate::{Error, Result}; /// Sitemap builder for the simple TXT file that contains one URL per line. @@ -144,6 +144,7 @@ mod tokio { #[cfg(test)] mod test { use std::io::BufWriter; + use url::Url; use crate::build::{Builder, PlainBuilder}; diff --git a/sitemapo/lib.rs b/sitemapo/lib.rs index 6239cd6..89a3402 100644 --- a/sitemapo/lib.rs +++ b/sitemapo/lib.rs @@ -51,8 +51,8 @@ pub mod record; #[doc(hidden)] pub mod prelude { - pub use super::{Error, Result}; pub use super::build::*; pub use super::parse::*; pub use super::record::*; + pub use super::{Error, Result}; } diff --git a/sitemapo/parse/auto.rs b/sitemapo/parse/auto.rs index 3e3f9a2..e602fff 100644 --- a/sitemapo/parse/auto.rs +++ b/sitemapo/parse/auto.rs @@ -3,7 +3,11 @@ use countio::Counter; use quick_xml::{events, Reader}; use url::Url; -use crate::{parse::*, record::*, Error}; +use crate::{ + parse::{try_if_readable, EntryParser, IndexParser, InnerParser, Parser, PlainParser}, + record::{Entry, SITEMAP_INDEX, URL_SET}, + Error, +}; /// Sitemap type resolver. // TODO: Check for the plain txt sitemaps. 
@@ -147,9 +151,9 @@ impl AutoParser { /// Returns minimal (no resolved indexes) total sitemaps amount. pub fn len(&self) -> usize { self.sitemaps.len() - + self.plain.is_some() as usize - + self.index.is_some() as usize - + self.entry.is_some() as usize + + usize::from(self.plain.is_some()) + + usize::from(self.index.is_some()) + + usize::from(self.entry.is_some()) } } @@ -199,7 +203,7 @@ where if let Some(sitemap) = self.sitemaps.pop() { let reader = (fetcher)(sitemap)?; if let Ok(sitemap) = Scanner::from_sync(reader) { - self.replace_parser(sitemap) + self.replace_parser(sitemap); } } diff --git a/sitemapo/parse/entry.rs b/sitemapo/parse/entry.rs index 35ebdf3..150a695 100644 --- a/sitemapo/parse/entry.rs +++ b/sitemapo/parse/entry.rs @@ -8,7 +8,7 @@ use crate::{Error, Result}; /// [`Entry`] builder. #[derive(Debug, Clone, Default)] -pub(crate) struct EntryFactory { +pub struct EntryFactory { location: Option, modified: Option, priority: Option, @@ -78,7 +78,7 @@ impl EntryParser { } /// Creates a new instance with the given inner parser. - pub(crate) fn from_inner(inner: InnerParser) -> Self { + pub(crate) const fn from_inner(inner: InnerParser) -> Self { Self { inner } } @@ -114,17 +114,17 @@ impl EntryParser { } } - pub(crate) fn write_event(&mut self, event: events::Event) -> Result> { + pub(crate) fn write_event(&mut self, event: events::Event) -> Output { let tag = URL.as_bytes(); let builder = self.inner.write_event(event, tag, Self::apply_inner); if let Ok(Output::Some(r)) = builder { if let Some(record) = r.build() { - return Ok(Output::Some(record)); + return Output::Some(record); } } - Ok(Output::None) + Output::None } } @@ -149,7 +149,7 @@ impl Parser for EntryParser { loop { self.inner.try_if_readable()?; let event = self.inner.reader.read_event_into(&mut buf)?; - match self.write_event(event)? 
{ + match self.write_event(event) { Output::Some(record) => return Ok(Some(record)), Output::None => {} Output::End => return Ok(None), diff --git a/sitemapo/parse/index.rs b/sitemapo/parse/index.rs index eb05574..c81b8e2 100644 --- a/sitemapo/parse/index.rs +++ b/sitemapo/parse/index.rs @@ -8,7 +8,7 @@ use crate::{Error, Result}; /// [`Index`] builder. #[derive(Debug, Clone, Default)] -pub(crate) struct IndexFactory { +pub struct IndexFactory { pub(crate) location: Option, pub(crate) modified: Option, } @@ -53,7 +53,7 @@ impl IndexParser { } /// Creates a new instance with the given inner parser. - pub(crate) fn from_inner(inner: InnerParser) -> Self { + pub(crate) const fn from_inner(inner: InnerParser) -> Self { Self { inner } } @@ -85,17 +85,17 @@ impl IndexParser { } } - pub(crate) fn write_event(&mut self, event: events::Event) -> Result> { + pub(crate) fn write_event(&mut self, event: events::Event) -> Output { let tag = SITEMAP.as_bytes(); let builder = self.inner.write_event(event, tag, Self::apply_inner); if let Ok(Output::Some(r)) = builder { if let Some(record) = r.build() { - return Ok(Output::Some(record)); + return Output::Some(record); } } - Ok(Output::None) + Output::None } } @@ -120,7 +120,7 @@ impl Parser for IndexParser { loop { self.inner.try_if_readable()?; let event = self.inner.reader.read_event_into(&mut buf)?; - match self.write_event(event)? { + match self.write_event(event) { Output::Some(record) => return Ok(Some(record)), Output::None => {} Output::End => return Ok(None), diff --git a/sitemapo/parse/inner.rs b/sitemapo/parse/inner.rs index dd0c5d0..851a59f 100644 --- a/sitemapo/parse/inner.rs +++ b/sitemapo/parse/inner.rs @@ -5,7 +5,7 @@ use quick_xml::{events::Event, Reader}; use crate::parse::try_if_readable; use crate::Result; -pub(crate) enum Output { +pub enum Output { /// Next record. Some(T), /// The event didn't result into the new record. 
@@ -16,11 +16,11 @@ pub(crate) enum Output { impl From> for Output { fn from(value: Option) -> Self { - value.map(Output::Some).unwrap_or(Output::End) + value.map_or(Self::End, Self::Some) } } -pub(crate) struct InnerParser { +pub struct InnerParser { pub(crate) record: Option, pub(crate) reader: Reader>, pub(crate) records: usize, @@ -113,6 +113,6 @@ impl std::fmt::Debug for InnerParser { f.debug_struct("InnerParser") .field("bytes", &self.reader.get_ref().reader_bytes()) .field("records", &self.records) - .finish() + .finish_non_exhaustive() } } diff --git a/sitemapo/parse/mod.rs b/sitemapo/parse/mod.rs index 72057b3..250e030 100644 --- a/sitemapo/parse/mod.rs +++ b/sitemapo/parse/mod.rs @@ -1,15 +1,15 @@ -mod auto; -mod entry; -mod index; -mod inner; -mod plain; - pub use auto::*; pub use entry::*; pub use index::*; pub(crate) use inner::*; pub use plain::*; +mod auto; +mod entry; +mod index; +mod inner; +mod plain; + /// Core trait for the parser implementation. pub trait Parser: Sized { type Error: std::error::Error; @@ -41,7 +41,7 @@ pub trait AsyncParser: Sized { async fn close(self) -> Result; } -pub(crate) fn try_if_readable(records: usize, bytes: usize) -> crate::Result<()> { +pub(crate) const fn try_if_readable(records: usize, bytes: usize) -> crate::Result<()> { use crate::record::{BYTE_LIMIT, RECORD_LIMIT}; if records + 1 > RECORD_LIMIT { diff --git a/sitemapo/record/entry.rs b/sitemapo/record/entry.rs index 0fb48d4..0ae6d4d 100644 --- a/sitemapo/record/entry.rs +++ b/sitemapo/record/entry.rs @@ -15,6 +15,7 @@ use crate::record::{Frequency, Priority}; /// .with_priority(Priority::MAX) /// .with_frequency(Frequency::Daily); /// ``` +#[must_use] #[derive(Debug, Clone)] pub struct Entry { pub location: Url, @@ -25,7 +26,7 @@ pub struct Entry { impl Entry { /// Creates a new instance with the given location. 
- pub fn new(location: Url) -> Self { + pub const fn new(location: Url) -> Self { Self { location, modified: None, @@ -35,19 +36,19 @@ impl Entry { } /// Creates a new record with the given modify timestamp. - pub fn with_modified(mut self, modified: OffsetDateTime) -> Self { + pub const fn with_modified(mut self, modified: OffsetDateTime) -> Self { self.modified = Some(modified); self } /// Creates a new record with the given priority. - pub fn with_priority(mut self, priority: Priority) -> Self { + pub const fn with_priority(mut self, priority: Priority) -> Self { self.priority = Some(priority); self } /// Creates a new record with the given change frequency. - pub fn with_frequency(mut self, frequency: Frequency) -> Self { + pub const fn with_frequency(mut self, frequency: Frequency) -> Self { self.frequency = Some(frequency); self } @@ -55,6 +56,6 @@ impl Entry { impl From for Entry { fn from(location: Url) -> Self { - Entry::new(location) + Self::new(location) } } diff --git a/sitemapo/record/frequency.rs b/sitemapo/record/frequency.rs index 649a56c..924ead2 100644 --- a/sitemapo/record/frequency.rs +++ b/sitemapo/record/frequency.rs @@ -12,6 +12,7 @@ pub struct FrequencyError; /// /// This value provides general information to search engines and /// may not correlate exactly to how often they crawl the page. +#[must_use] #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Frequency { /// Describes documents that change each time they are accessed. 
@@ -40,17 +41,14 @@ impl Frequency { /// assert_eq!(frequency.unwrap(), Frequency::Daily); /// ``` pub fn parse(frequency: &str) -> Result { - let frequency = frequency.trim().to_lowercase(); - - use Frequency::*; - match frequency.as_str() { - "always" => Ok(Always), - "hourly" => Ok(Hourly), - "daily" => Ok(Daily), - "weekly" => Ok(Weekly), - "monthly" => Ok(Monthly), - "yearly" => Ok(Yearly), - "never" => Ok(Never), + match frequency.trim().to_lowercase().as_str() { + "always" => Ok(Self::Always), + "hourly" => Ok(Self::Hourly), + "daily" => Ok(Self::Daily), + "weekly" => Ok(Self::Weekly), + "monthly" => Ok(Self::Monthly), + "yearly" => Ok(Self::Yearly), + "never" => Ok(Self::Never), _ => Err(FrequencyError), } } @@ -66,15 +64,15 @@ impl Frequency { /// let rs = Frequency::Monthly.next_date(d0); /// assert_eq!(rs.unwrap(), datetime!(2022-10-12 12:00 UTC)) /// ``` + #[must_use] pub fn next_date(&self, date: OffsetDateTime) -> Option { - use Frequency::*; match &self { - Always | Never => None, - Hourly => Some(date + 1.hours()), - Daily => Some(date + 1.days()), - Weekly => Some(date + 7.days()), - Monthly => Some(date + 30.days()), - Yearly => Some(date + 365.days()), + Self::Always | Self::Never => None, + Self::Hourly => Some(date + 1.hours()), + Self::Daily => Some(date + 1.days()), + Self::Weekly => Some(date + 7.days()), + Self::Monthly => Some(date + 30.days()), + Self::Yearly => Some(date + 365.days()), } } @@ -88,14 +86,14 @@ impl Frequency { /// let d1 = datetime!(2022-10-12 12:00 UTC); /// assert!(Frequency::Monthly.is_outdated(d0, d1)); /// ``` + #[must_use] pub fn is_outdated(&self, date: OffsetDateTime, now: OffsetDateTime) -> bool { match &self { Self::Always => true, Self::Never => false, - _ => match self.next_date(date) { - Some(next) => next <= now, - None => unreachable!(), - }, + _ => self + .next_date(date) + .map_or_else(|| unreachable!(), |x| x <= now), } } } diff --git a/sitemapo/record/index.rs b/sitemapo/record/index.rs index 
716a9ae..b54812b 100644 --- a/sitemapo/record/index.rs +++ b/sitemapo/record/index.rs @@ -11,6 +11,7 @@ use url::Url; /// let _ = Index::new(Url::parse("https://example.com/").unwrap()) /// .with_modified(datetime!(2020-01-01 0:00 UTC)); /// ``` +#[must_use] #[derive(Debug, Clone)] pub struct Index { pub location: Url, @@ -19,7 +20,7 @@ pub struct Index { impl Index { /// Creates a new record with the given location. - pub fn new(location: Url) -> Self { + pub const fn new(location: Url) -> Self { Self { location, modified: None, @@ -27,11 +28,9 @@ impl Index { } /// Creates a new record with the given modify timestamp. - pub fn with_modified(self, modified: OffsetDateTime) -> Self { - Self { - modified: Some(modified), - ..self - } + pub const fn with_modified(mut self, modified: OffsetDateTime) -> Self { + self.modified = Some(modified); + self } } diff --git a/sitemapo/record/priority.rs b/sitemapo/record/priority.rs index f0aef52..dbbd5eb 100644 --- a/sitemapo/record/priority.rs +++ b/sitemapo/record/priority.rs @@ -17,6 +17,7 @@ pub enum PriorityError { /// Valid values range from 0.0 to 1.0. This value does not affect how your /// pages are compared to pages on other sites. It only lets the search engines /// know which pages you deem most important for the crawlers. +#[must_use] #[derive(Debug, Clone, Copy, PartialEq)] pub struct Priority(f32); @@ -46,7 +47,7 @@ impl Priority { /// assert_eq!(frequency.as_inner(), 1.0); /// ``` pub fn new_fallback(priority: f32) -> Self { - Self(priority.max(0.0).min(1.0)) + Self(priority.clamp(0.0, 1.0)) } /// Tries to parse the string into the valid priority value. @@ -63,7 +64,8 @@ impl Priority { } /// Returns the internal value. 
- pub fn as_inner(&self) -> f32 { + #[must_use] + pub const fn as_inner(&self) -> f32 { self.0 } From d5640b6255c5399c852287abdf221277986801f3 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Wed, 26 Jun 2024 21:38:02 +0200 Subject: [PATCH 10/11] feat(all): minor fixes --- Cargo.toml | 2 +- README.md | 3 +- countio/Cargo.toml | 10 +++---- robotxt/Cargo.toml | 8 ++--- robotxt/README.md | 65 +++++++++++++++++++---------------------- robotxt/lib.rs | 7 ++++- robotxt/parse/access.rs | 16 +++++++++- robotxt/paths/create.rs | 26 +++++++++++++---- sitemapo/Cargo.toml | 6 ++-- sitemapo/README.md | 4 +-- sitemapo/parse/entry.rs | 2 +- sitemapo/parse/index.rs | 2 +- 12 files changed, 89 insertions(+), 62 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ecb42d5..12eebc3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ authors = ["Oleh Martsokha "] license = "MIT" [workspace.dependencies] -tokio = { version = "1.38", default-features = false } +tokio = { version = "1", default-features = false } futures-io = { version = "0.3", default-features = false } futures-util = { version = "0.3", default-features = false } futures-test = { version = "0.3", default-features = false } diff --git a/README.md b/README.md index f490897..f0aa6ac 100644 --- a/README.md +++ b/README.md @@ -17,5 +17,4 @@ protocol with the support of `crawl-delay`, `sitemap` and universal `*` match extensions. - [sitemapo](./sitemapo/): The implementation of the Sitemap (or URL inclusion) - protocol with the support of txt, xml formats and video, image, and news - extensions. + protocol with the support of txt and xml formats. 
diff --git a/countio/Cargo.toml b/countio/Cargo.toml index fb9f3d8..b487537 100644 --- a/countio/Cargo.toml +++ b/countio/Cargo.toml @@ -35,10 +35,10 @@ tokio = ["dep:tokio"] futures = ["dep:futures-io"] [dependencies] -tokio = { workspace = true, optional = true } -futures-io = { workspace = true, optional = true, features = ["std"] } +tokio = { version = "1", default-features = false, optional = true } +futures-io = { version = "0.3", default-features = false, optional = true, features = ["std"] } [dev-dependencies] -tokio = { workspace = true, features = ["rt", "macros", "io-util"] } -futures-util = { workspace = true } -futures-test = { workspace = true, features = ["std"] } +tokio = { version = "1", features = ["rt", "macros", "io-util"] } +futures-util = { version = "0.3", default-features = false } +futures-test = { version = "0.3", default-features = false, features = ["std"] } diff --git a/robotxt/Cargo.toml b/robotxt/Cargo.toml index 8338574..8d2abfe 100644 --- a/robotxt/Cargo.toml +++ b/robotxt/Cargo.toml @@ -45,14 +45,14 @@ optimal = [] serde = ["dep:serde", "url/serde", "serde/derive", "serde/rc"] [dependencies] -url = { workspace = true } -thiserror = { workspace = true } +url = { version = "2.5" } +thiserror = { version = "1.0" } percent-encoding = { version = "2.3" } nom = { version = "7.1", optional = true } bstr = { version = "1.9", optional = true } regex = { version = "1.10", optional = true } -serde = { workspace = true, optional = true } +serde = { version = "1.0", optional = true } [dev-dependencies] -serde_json = { workspace = true } +serde_json = { version = "1.0" } diff --git a/robotxt/README.md b/robotxt/README.md index f5ff32b..88c5722 100644 --- a/robotxt/README.md +++ b/robotxt/README.md @@ -37,19 +37,17 @@ programming language with the support of `crawl-delay`, `sitemap` and universal ```rust use robotxt::Robots; -fn main() { - let txt = r#" - User-Agent: foobot - Disallow: * - Allow: /example/ - Disallow: /example/nope.txt - "#; - - 
let r = Robots::from_bytes(txt.as_bytes(), "foobot"); - assert!(r.is_relative_allowed("/example/yeah.txt")); - assert!(!r.is_relative_allowed("/example/nope.txt")); - assert!(!r.is_relative_allowed("/invalid/path.txt")); -} +let txt = r#" + User-Agent: foobot + Disallow: * + Allow: /example/ + Disallow: /example/nope.txt +"#; + +let r = Robots::from_bytes(txt.as_bytes(), "foobot"); +assert!(r.is_relative_allowed("/example/yeah.txt")); +assert!(!r.is_relative_allowed("/example/nope.txt")); +assert!(!r.is_relative_allowed("/invalid/path.txt")); ``` - build the new `robots.txt` file in a declarative manner: @@ -57,28 +55,25 @@ fn main() { ```rust use robotxt::{RobotsBuilder, Result}; -fn main() -> Result<()> { - let txt = RobotsBuilder::default() - .header("Robots.txt: Start") - .group(["foobot"], |u| { - u.crawl_delay(5) - .header("Rules for Foobot: Start") - .allow("/example/yeah.txt") - .disallow("/example/nope.txt") - .footer("Rules for Foobot: End") - }) - .group(["barbot", "nombot"], |u| { - u.crawl_delay(2) - .disallow("/example/yeah.txt") - .disallow("/example/nope.txt") - }) - .sitemap("https://example.com/sitemap_1.xml".try_into()?) - .sitemap("https://example.com/sitemap_1.xml".try_into()?) - .footer("Robots.txt: End"); - - println!("{}", txt.to_string()); - Ok(()) -} +let txt = RobotsBuilder::default() + .header("Robots.txt: Start") + .group(["foobot"], |u| { + u.crawl_delay(5) + .header("Rules for Foobot: Start") + .allow("/example/yeah.txt") + .disallow("/example/nope.txt") + .footer("Rules for Foobot: End") + }) + .group(["barbot", "nombot"], |u| { + u.crawl_delay(2) + .disallow("/example/yeah.txt") + .disallow("/example/nope.txt") + }) + .sitemap("https://example.com/sitemap_1.xml".try_into()?) + .sitemap("https://example.com/sitemap_1.xml".try_into()?) 
+ .footer("Robots.txt: End"); + +println!("{}", txt.to_string()); ``` ### Links diff --git a/robotxt/lib.rs b/robotxt/lib.rs index b03ba45..cf58df9 100644 --- a/robotxt/lib.rs +++ b/robotxt/lib.rs @@ -23,9 +23,14 @@ pub enum Error { #[error("cannot be a base url")] CannotBeBase, + /// Unable to create the expected path to the `robots.txt` file: + /// does not have a host. + #[error("does not have a host")] + NoHost, + /// Unable to create the expected path to the `robots.txt` file: /// unexpected address scheme, expected `http` or `https`. - #[error("addr scheme: `{scheme}`, expected `http` or `https`")] + #[error("scheme: `{scheme}`, expected `http` or `https`")] WrongScheme { scheme: String }, /// Unable to create the expected path to the `robots.txt` file: diff --git a/robotxt/parse/access.rs b/robotxt/parse/access.rs index acd64f8..0fe8683 100644 --- a/robotxt/parse/access.rs +++ b/robotxt/parse/access.rs @@ -1,4 +1,5 @@ use std::fmt; +use std::ops::Deref; /// The result of the `robots.txt` retrieval attempt. /// @@ -6,13 +7,14 @@ use std::fmt; /// Also see 2.3.1. Access Results in the specification. /// /// [`Robots::from_access`]: crate::Robots::from_access -#[derive(Debug)] +#[derive(Debug, Copy, Clone)] pub enum AccessResult<'a> { /// 2.3.1.1. Successful Access /// /// If the crawler successfully downloads the robots.txt file, the /// crawler MUST follow the parseable rules. Successful(&'a [u8]), + /// 2.3.1.2. Redirects /// /// It's possible that a server responds to a robots.txt fetch request @@ -27,6 +29,7 @@ pub enum AccessResult<'a> { /// If there are more than five consecutive redirects, crawlers MAY /// assume that the robots.txt file is unavailable. Redirect, + /// 2.3.1.3. "Unavailable" Status /// /// "Unavailable" means the crawler tries to fetch the robots.txt file @@ -38,6 +41,7 @@ pub enum AccessResult<'a> { /// unavailable to the crawler, then the crawler MAY access any resources /// on the server. Unavailable, + /// 2.3.1.4. 
"Unreachable" Status /// /// If the robots.txt file is unreachable due to server or network @@ -65,7 +69,17 @@ impl AccessResult<'_> { } } +impl Deref for AccessResult<'_> { + type Target = str; + + #[inline] + fn deref(&self) -> &Self::Target { + self.as_str() + } +} + impl fmt::Display for AccessResult<'_> { + #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.as_str()) } diff --git a/robotxt/paths/create.rs b/robotxt/paths/create.rs index e7be5e9..600a686 100644 --- a/robotxt/paths/create.rs +++ b/robotxt/paths/create.rs @@ -1,7 +1,16 @@ +use url::Url; + use crate::{Error, Result}; /// Returns the expected path to the `robots.txt` file -/// as the [`url::Url`]. +/// as the `url::`[`Url`]. +/// +/// # Errors +/// +/// Returns the error if the provided [`Url`] cannot be a base, +/// does not have a host or the schema is not `http` or `https`. +/// +/// # Examples /// /// ```rust /// use url::Url; @@ -12,13 +21,17 @@ use crate::{Error, Result}; /// let robots = create_url(&path).unwrap().to_string(); /// assert_eq!(robots, "https://example.com/robots.txt") /// ``` -pub fn create_url(path: &url::Url) -> Result { +pub fn create_url(path: &Url) -> Result { let mut path = path.clone(); if path.cannot_be_a_base() { return Err(Error::CannotBeBase); } + if path.host().is_none() { + return Err(Error::NoHost); + } + if path.scheme() != "http" && path.scheme() != "https" { return Err(Error::WrongScheme { scheme: path.scheme().to_string(), @@ -26,11 +39,12 @@ pub fn create_url(path: &url::Url) -> Result { } if !path.username().is_empty() { - path.set_username("").unwrap(); + path.set_username("").expect("should pass base/host tests"); } if path.password().is_some() { - path.set_password(None).unwrap(); + path.set_password(None) + .expect("should pass base/host tests"); } path.join("/robots.txt").map_err(Into::into) @@ -38,12 +52,12 @@ pub fn create_url(path: &url::Url) -> Result { #[cfg(test)] mod test { - use super::*; + use 
crate::{create_url, url::Url, Result}; #[test] fn from_url() -> Result<()> { let path = "https://user:pass@example.com/foo/sample.txt"; - let path = url::Url::parse(path).unwrap(); + let path = Url::parse(path).unwrap(); let robots = create_url(&path)?.to_string(); assert_eq!(robots, "https://example.com/robots.txt"); diff --git a/sitemapo/Cargo.toml b/sitemapo/Cargo.toml index 702af36..44683c0 100644 --- a/sitemapo/Cargo.toml +++ b/sitemapo/Cargo.toml @@ -15,8 +15,8 @@ documentation = "https://docs.rs/sitemapo" categories = ["parser-implementations", "web-programming"] keywords = ["sitemaps", "sitemap", "inclusion", "crawler", "scraper"] description = """ -The implementation of the Sitemap.xml (or URL inclusion) protocol with -the support of txt & xml formats, and video, image, news extensions. +The implementation of the Sitemap.xml (or URL inclusion) protocol +with the support of txt & xml formats. """ [package.metadata.docs.rs] @@ -44,7 +44,7 @@ url = { workspace = true } thiserror = { workspace = true } countio = { version = "0.2" } -quick-xml = { version = "0.31" } +quick-xml = { version = "0.32" } bytes = { version = "1.6", features = [] } time = { workspace = true, features = ["parsing", "formatting"] } diff --git a/sitemapo/README.md b/sitemapo/README.md index eaa4db6..04d61a5 100644 --- a/sitemapo/README.md +++ b/sitemapo/README.md @@ -17,8 +17,8 @@ [coverage-url]: https://app.codecov.io/gh/spire-rs/kit The implementation of the Sitemap (or URL inclusion) protocol in the Rust -programming language with the support of `txt` & `xml` formats, and `video`, -`image`, `news` extensions (according to the Google's spec). +programming language with the support of `txt` & `xml` formats (according to the +Google's spec). 
### Features diff --git a/sitemapo/parse/entry.rs b/sitemapo/parse/entry.rs index 150a695..249a3e6 100644 --- a/sitemapo/parse/entry.rs +++ b/sitemapo/parse/entry.rs @@ -185,7 +185,7 @@ mod async_parser { loop { self.inner.try_if_readable()?; let event = self.inner.reader.read_event_into_async(&mut buf).await?; - match self.write_event(event)? { + match self.write_event(event) { Output::Some(record) => return Ok(Some(record)), Output::None => {} Output::End => return Ok(None), diff --git a/sitemapo/parse/index.rs b/sitemapo/parse/index.rs index c81b8e2..77bf4d4 100644 --- a/sitemapo/parse/index.rs +++ b/sitemapo/parse/index.rs @@ -156,7 +156,7 @@ mod tokio { loop { self.inner.try_if_readable()?; let event = self.inner.reader.read_event_into_async(&mut buf).await?; - match self.write_event(event)? { + match self.write_event(event) { Output::Some(record) => return Ok(Some(record)), Output::None => {} Output::End => return Ok(None), From eea875a32e2cf94ed18e96f669de993d031b5ede Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Wed, 26 Jun 2024 21:40:01 +0200 Subject: [PATCH 11/11] feat(all): bump & release --- countio/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/countio/Cargo.toml b/countio/Cargo.toml index b487537..7bf21ef 100644 --- a/countio/Cargo.toml +++ b/countio/Cargo.toml @@ -2,7 +2,7 @@ [package] name = "countio" -version = "0.2.17" +version = "0.2.18" readme = "./README.md" edition = { workspace = true }