From d5640b6255c5399c852287abdf221277986801f3 Mon Sep 17 00:00:00 2001
From: Oleh Martsokha
Date: Wed, 26 Jun 2024 21:38:02 +0200
Subject: [PATCH] feat(all): minor fixes

---
 Cargo.toml              |  2 +-
 README.md               |  3 +-
 countio/Cargo.toml      | 10 +++----
 robotxt/Cargo.toml      |  8 ++---
 robotxt/README.md       | 65 +++++++++++++++++++----------------
 robotxt/lib.rs          |  7 ++++-
 robotxt/parse/access.rs | 16 +++++++++-
 robotxt/paths/create.rs | 26 +++++++++++++----
 sitemapo/Cargo.toml     |  6 ++--
 sitemapo/README.md      |  4 +--
 sitemapo/parse/entry.rs |  2 +-
 sitemapo/parse/index.rs |  2 +-
 12 files changed, 89 insertions(+), 62 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index ecb42d5..12eebc3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,7 +14,7 @@ authors = ["Oleh Martsokha "]
 license = "MIT"
 
 [workspace.dependencies]
-tokio = { version = "1.38", default-features = false }
+tokio = { version = "1", default-features = false }
 futures-io = { version = "0.3", default-features = false }
 futures-util = { version = "0.3", default-features = false }
 futures-test = { version = "0.3", default-features = false }
diff --git a/README.md b/README.md
index f490897..f0aa6ac 100644
--- a/README.md
+++ b/README.md
@@ -17,5 +17,4 @@
   protocol with the support of `crawl-delay`, `sitemap` and universal `*` match
   extensions.
 - [sitemapo](./sitemapo/): The implementation of the Sitemap (or URL inclusion)
-  protocol with the support of txt, xml formats and video, image, and news
-  extensions.
+  protocol with the support of txt and xml formats.
diff --git a/countio/Cargo.toml b/countio/Cargo.toml
index fb9f3d8..b487537 100644
--- a/countio/Cargo.toml
+++ b/countio/Cargo.toml
@@ -35,10 +35,10 @@ tokio = ["dep:tokio"]
 futures = ["dep:futures-io"]
 
 [dependencies]
-tokio = { workspace = true, optional = true }
-futures-io = { workspace = true, optional = true, features = ["std"] }
+tokio = { version = "1", default-features = false, optional = true }
+futures-io = { version = "0.3", default-features = false, optional = true, features = ["std"] }
 
 [dev-dependencies]
-tokio = { workspace = true, features = ["rt", "macros", "io-util"] }
-futures-util = { workspace = true }
-futures-test = { workspace = true, features = ["std"] }
+tokio = { version = "1", features = ["rt", "macros", "io-util"] }
+futures-util = { version = "0.3", default-features = false }
+futures-test = { version = "0.3", default-features = false, features = ["std"] }
diff --git a/robotxt/Cargo.toml b/robotxt/Cargo.toml
index 8338574..8d2abfe 100644
--- a/robotxt/Cargo.toml
+++ b/robotxt/Cargo.toml
@@ -45,14 +45,14 @@ optimal = []
 serde = ["dep:serde", "url/serde", "serde/derive", "serde/rc"]
 
 [dependencies]
-url = { workspace = true }
-thiserror = { workspace = true }
+url = { version = "2.5" }
+thiserror = { version = "1.0" }
 percent-encoding = { version = "2.3" }
 
 nom = { version = "7.1", optional = true }
 bstr = { version = "1.9", optional = true }
 regex = { version = "1.10", optional = true }
-serde = { workspace = true, optional = true }
+serde = { version = "1.0", optional = true }
 
 [dev-dependencies]
-serde_json = { workspace = true }
+serde_json = { version = "1.0" }
diff --git a/robotxt/README.md b/robotxt/README.md
index f5ff32b..88c5722 100644
--- a/robotxt/README.md
+++ b/robotxt/README.md
@@ -37,19 +37,17 @@ programming language with the support of `crawl-delay`, `sitemap` and universal
 ```rust
 use robotxt::Robots;
 
-fn main() {
-    let txt = r#"
-        User-Agent: foobot
-        Disallow: *
-        Allow: /example/
-        Disallow: /example/nope.txt
-    "#;
-
-    let r = Robots::from_bytes(txt.as_bytes(), "foobot");
-    assert!(r.is_relative_allowed("/example/yeah.txt"));
-    assert!(!r.is_relative_allowed("/example/nope.txt"));
-    assert!(!r.is_relative_allowed("/invalid/path.txt"));
-}
+let txt = r#"
+    User-Agent: foobot
+    Disallow: *
+    Allow: /example/
+    Disallow: /example/nope.txt
+"#;
+
+let r = Robots::from_bytes(txt.as_bytes(), "foobot");
+assert!(r.is_relative_allowed("/example/yeah.txt"));
+assert!(!r.is_relative_allowed("/example/nope.txt"));
+assert!(!r.is_relative_allowed("/invalid/path.txt"));
 ```
 
 - build the new `robots.txt` file in a declarative manner:
@@ -57,28 +55,25 @@ fn main() {
 ```rust
 use robotxt::{RobotsBuilder, Result};
 
-fn main() -> Result<()> {
-    let txt = RobotsBuilder::default()
-        .header("Robots.txt: Start")
-        .group(["foobot"], |u| {
-            u.crawl_delay(5)
-                .header("Rules for Foobot: Start")
-                .allow("/example/yeah.txt")
-                .disallow("/example/nope.txt")
-                .footer("Rules for Foobot: End")
-        })
-        .group(["barbot", "nombot"], |u| {
-            u.crawl_delay(2)
-                .disallow("/example/yeah.txt")
-                .disallow("/example/nope.txt")
-        })
-        .sitemap("https://example.com/sitemap_1.xml".try_into()?)
-        .sitemap("https://example.com/sitemap_1.xml".try_into()?)
-        .footer("Robots.txt: End");
-
-    println!("{}", txt.to_string());
-    Ok(())
-}
+let txt = RobotsBuilder::default()
+    .header("Robots.txt: Start")
+    .group(["foobot"], |u| {
+        u.crawl_delay(5)
+            .header("Rules for Foobot: Start")
+            .allow("/example/yeah.txt")
+            .disallow("/example/nope.txt")
+            .footer("Rules for Foobot: End")
+    })
+    .group(["barbot", "nombot"], |u| {
+        u.crawl_delay(2)
+            .disallow("/example/yeah.txt")
+            .disallow("/example/nope.txt")
+    })
+    .sitemap("https://example.com/sitemap_1.xml".try_into()?)
+    .sitemap("https://example.com/sitemap_1.xml".try_into()?)
+    .footer("Robots.txt: End");
+
+println!("{}", txt.to_string());
 ```
 
 ### Links
diff --git a/robotxt/lib.rs b/robotxt/lib.rs
index b03ba45..cf58df9 100644
--- a/robotxt/lib.rs
+++ b/robotxt/lib.rs
@@ -23,9 +23,14 @@ pub enum Error {
     #[error("cannot be a base url")]
     CannotBeBase,
 
+    /// Unable to create the expected path to the `robots.txt` file:
+    /// does not have a host.
+    #[error("does not have a host")]
+    NoHost,
+
     /// Unable to create the expected path to the `robots.txt` file:
     /// unexpected address scheme, expected `http` or `https`.
-    #[error("addr scheme: `{scheme}`, expected `http` or `https`")]
+    #[error("scheme: `{scheme}`, expected `http` or `https`")]
     WrongScheme { scheme: String },
 
     /// Unable to create the expected path to the `robots.txt` file:
diff --git a/robotxt/parse/access.rs b/robotxt/parse/access.rs
index acd64f8..0fe8683 100644
--- a/robotxt/parse/access.rs
+++ b/robotxt/parse/access.rs
@@ -1,4 +1,5 @@
 use std::fmt;
+use std::ops::Deref;
 
 /// The result of the `robots.txt` retrieval attempt.
 ///
@@ -6,13 +7,14 @@ use std::fmt;
 /// Also see 2.3.1. Access Results in the specification.
 ///
 /// [`Robots::from_access`]: crate::Robots::from_access
-#[derive(Debug)]
+#[derive(Debug, Copy, Clone)]
 pub enum AccessResult<'a> {
     /// 2.3.1.1. Successful Access
     ///
     /// If the crawler successfully downloads the robots.txt file, the
     /// crawler MUST follow the parseable rules.
     Successful(&'a [u8]),
+
     /// 2.3.1.2. Redirects
     ///
     /// It's possible that a server responds to a robots.txt fetch request
@@ -27,6 +29,7 @@ pub enum AccessResult<'a> {
     /// If there are more than five consecutive redirects, crawlers MAY
     /// assume that the robots.txt file is unavailable.
     Redirect,
+
     /// 2.3.1.3. "Unavailable" Status
     ///
     /// "Unavailable" means the crawler tries to fetch the robots.txt file
@@ -38,6 +41,7 @@ pub enum AccessResult<'a> {
     /// unavailable to the crawler, then the crawler MAY access any resources
     /// on the server.
     Unavailable,
+
     /// 2.3.1.4. "Unreachable" Status
     ///
     /// If the robots.txt file is unreachable due to server or network
@@ -65,7 +69,17 @@ impl AccessResult<'_> {
     }
 }
 
+impl Deref for AccessResult<'_> {
+    type Target = str;
+
+    #[inline]
+    fn deref(&self) -> &Self::Target {
+        self.as_str()
+    }
+}
+
 impl fmt::Display for AccessResult<'_> {
+    #[inline]
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(f, "{}", self.as_str())
     }
diff --git a/robotxt/paths/create.rs b/robotxt/paths/create.rs
index e7be5e9..600a686 100644
--- a/robotxt/paths/create.rs
+++ b/robotxt/paths/create.rs
@@ -1,7 +1,16 @@
+use url::Url;
+
 use crate::{Error, Result};
 
 /// Returns the expected path to the `robots.txt` file
-/// as the [`url::Url`].
+/// as the `url::`[`Url`].
+///
+/// # Errors
+///
+/// Returns the error if the provided [`Url`] cannot be a base,
+/// does not have a host or the schema is not `http` or `https`.
+///
+/// # Examples
 ///
 /// ```rust
 /// use url::Url;
@@ -12,13 +21,17 @@ use crate::{Error, Result};
 /// let path = "https://user:pass@example.com/foo/sample.txt";
 /// let path = Url::parse(path).unwrap();
 /// let robots = create_url(&path).unwrap().to_string();
 /// assert_eq!(robots, "https://example.com/robots.txt")
 /// ```
-pub fn create_url(path: &url::Url) -> Result<url::Url> {
+pub fn create_url(path: &Url) -> Result<Url> {
     let mut path = path.clone();
 
     if path.cannot_be_a_base() {
         return Err(Error::CannotBeBase);
     }
 
+    if path.host().is_none() {
+        return Err(Error::NoHost);
+    }
+
     if path.scheme() != "http" && path.scheme() != "https" {
         return Err(Error::WrongScheme {
             scheme: path.scheme().to_string(),
@@ -26,11 +39,12 @@ pub fn create_url(path: &url::Url) -> Result<url::Url> {
     }
 
     if !path.username().is_empty() {
-        path.set_username("").unwrap();
+        path.set_username("").expect("should pass base/host tests");
     }
 
     if path.password().is_some() {
-        path.set_password(None).unwrap();
+        path.set_password(None)
+            .expect("should pass base/host tests");
     }
 
     path.join("/robots.txt").map_err(Into::into)
@@ -38,12 +52,12 @@
 
 #[cfg(test)]
 mod test {
-    use super::*;
+    use crate::{create_url, url::Url, Result};
 
     #[test]
     fn from_url() -> Result<()> {
         let path = "https://user:pass@example.com/foo/sample.txt";
-        let path = url::Url::parse(path).unwrap();
+        let path = Url::parse(path).unwrap();
 
         let robots = create_url(&path)?.to_string();
         assert_eq!(robots, "https://example.com/robots.txt");
diff --git a/sitemapo/Cargo.toml b/sitemapo/Cargo.toml
index 702af36..44683c0 100644
--- a/sitemapo/Cargo.toml
+++ b/sitemapo/Cargo.toml
@@ -15,8 +15,8 @@ documentation = "https://docs.rs/sitemapo"
 categories = ["parser-implementations", "web-programming"]
 keywords = ["sitemaps", "sitemap", "inclusion", "crawler", "scraper"]
 description = """
-The implementation of the Sitemap.xml (or URL inclusion) protocol with
-the support of txt & xml formats, and video, image, news extensions.
+The implementation of the Sitemap.xml (or URL inclusion) protocol
+with the support of txt & xml formats.
""" [package.metadata.docs.rs] @@ -44,7 +44,7 @@ url = { workspace = true } thiserror = { workspace = true } countio = { version = "0.2" } -quick-xml = { version = "0.31" } +quick-xml = { version = "0.32" } bytes = { version = "1.6", features = [] } time = { workspace = true, features = ["parsing", "formatting"] } diff --git a/sitemapo/README.md b/sitemapo/README.md index eaa4db6..04d61a5 100644 --- a/sitemapo/README.md +++ b/sitemapo/README.md @@ -17,8 +17,8 @@ [coverage-url]: https://app.codecov.io/gh/spire-rs/kit The implementation of the Sitemap (or URL inclusion) protocol in the Rust -programming language with the support of `txt` & `xml` formats, and `video`, -`image`, `news` extensions (according to the Google's spec). +programming language with the support of `txt` & `xml` formats (according to the +Google's spec). ### Features diff --git a/sitemapo/parse/entry.rs b/sitemapo/parse/entry.rs index 150a695..249a3e6 100644 --- a/sitemapo/parse/entry.rs +++ b/sitemapo/parse/entry.rs @@ -185,7 +185,7 @@ mod async_parser { loop { self.inner.try_if_readable()?; let event = self.inner.reader.read_event_into_async(&mut buf).await?; - match self.write_event(event)? { + match self.write_event(event) { Output::Some(record) => return Ok(Some(record)), Output::None => {} Output::End => return Ok(None), diff --git a/sitemapo/parse/index.rs b/sitemapo/parse/index.rs index c81b8e2..77bf4d4 100644 --- a/sitemapo/parse/index.rs +++ b/sitemapo/parse/index.rs @@ -156,7 +156,7 @@ mod tokio { loop { self.inner.try_if_readable()?; let event = self.inner.reader.read_event_into_async(&mut buf).await?; - match self.write_event(event)? { + match self.write_event(event) { Output::Some(record) => return Ok(Some(record)), Output::None => {} Output::End => return Ok(None),