-
-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
14 changed files
with
1,183 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
[package]
name = "rust-search-extension"
version = "0.1.0"
authors = ["Folyd <[email protected]>"]
edition = "2021"
# Internal index-generator tool; never published to crates.io.
publish = false

[dependencies]
# Serialization of the generated index files.
serde = { version = "1", features = ["derive"] }
serde_json = "1"
# JS minification of the emitted index modules.
minifier = "0.3"
# Word-boundary splitting used when minifying crate descriptions.
unicode-segmentation = "1"
# Async HTTP crawling of book/doc pages.
futures = "0.3"
tokio = { version = "1", features = ["macros", "time", "rt-multi-thread"] }
reqwest = { version = "0.11", features = ["json"] }
# HTML scraping.
select = "0.6"
semver = { version = "1", features = ["serde"] }
# Data-parallel processing of large index sets.
rayon = "1"
regex = "1"
# CLI argument/subcommand parsing.
argh = "0.1"
# RustSec advisory database access.
rustsec = "0"
html-escape = "0"
# crates.io database dump parsing.
db-dump = "0.7.1"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
/// A word paired with the number of times it occurs in the corpus.
#[derive(Debug)]
pub struct FrequencyWord {
    pub word: String,
    pub frequency: usize,
}

impl FrequencyWord {
    /// Ranking score used to decide which words are worth assigning
    /// a two-letter replacement key during minification.
    ///
    /// Due to the prefix + suffix occupying two letters,
    /// we subtract 2 from the length to calculate the score.
    /// This will lead to a 0.4% reduction in file size.
    #[inline]
    pub fn score(&self) -> usize {
        // `saturating_sub` prevents underflow for 0- or 1-letter words,
        // where the original `len() - 2` would panic in debug builds
        // (and wrap to a huge score in release builds).
        self.word.len().saturating_sub(2) * self.frequency
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
use std::error::Error; | ||
|
||
use argh::FromArgs; | ||
|
||
use crate::tasks::*; | ||
|
||
mod frequency; | ||
mod minify; | ||
mod tasks; | ||
|
||
/// Options
// Top-level CLI options for the index generator. NOTE: the `///` line
// above doubles as argh's help text, so it is left untouched.
#[derive(FromArgs)]
struct Options {
    // Which index-generation task to run; parsed from the CLI subcommand.
    #[argh(subcommand)]
    subcommand: Subcommand,
}
|
||
/// Subcommands
// One variant per generated index; each wraps the task type (from the
// `tasks` module) that performs the actual work. The `///` line above is
// argh's help text and is left untouched.
#[derive(FromArgs)]
#[argh(subcommand)]
#[non_exhaustive]
enum Subcommand {
    Advisory(AdvisoryTask),
    Crates(CratesTask),
    Books(BooksTask),
    Caniuse(CaniuseTask),
    Lints(LintsTask),
    Labels(LabelsTask),
    Rfcs(RfcsTask),
    Rustc(RustcTask),
    Targets(TargetsTask),
}
|
||
pub type Result<T> = std::result::Result<T, Box<dyn Error>>; | ||
|
||
// Entry point: parse CLI arguments via argh and dispatch to the selected
// task. Every arm delegates to the task's `execute()`, bubbling any error
// up with `?` so the process exits non-zero on failure.
fn main() -> Result<()> {
    let options: Options = argh::from_env();
    match options.subcommand {
        Subcommand::Advisory(cmd) => cmd.execute()?,
        Subcommand::Crates(cmd) => cmd.execute()?,
        Subcommand::Books(cmd) => cmd.execute()?,
        Subcommand::Caniuse(cmd) => cmd.execute()?,
        Subcommand::Lints(cmd) => cmd.execute()?,
        Subcommand::Labels(cmd) => cmd.execute()?,
        Subcommand::Rfcs(cmd) => cmd.execute()?,
        Subcommand::Rustc(cmd) => cmd.execute()?,
        Subcommand::Targets(cmd) => cmd.execute()?,
    }
    Ok(())
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
use minifier::js::{ | ||
aggregate_strings_into_array_filter, simple_minify, Keyword, ReservedChar, Token, Tokens, | ||
}; | ||
use rayon::prelude::*; | ||
use std::collections::HashMap; | ||
use std::ops::Deref; | ||
use unicode_segmentation::UnicodeSegmentation; | ||
|
||
use crate::frequency::FrequencyWord; | ||
|
||
/// Shrinks generated index files by replacing frequently-occurring words
/// with short two-character keys.
#[derive(Debug)]
pub struct Minifier<'a> {
    // A word to keys mapping. Such as <"cargo", "$0">.
    mapping: HashMap<&'a str, String>,
}
|
||
impl<'a> Minifier<'a> { | ||
const PREFIX: &'static str = "@$^&"; | ||
const SUFFIX: &'static str = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; | ||
|
||
pub fn new(frequency_words: &'a [FrequencyWord]) -> Minifier<'a> { | ||
let keys: Vec<String> = Self::PREFIX | ||
.chars() | ||
.flat_map(|prefix| { | ||
Self::SUFFIX | ||
.chars() | ||
.map(|suffix| format!("{prefix}{suffix}")) | ||
.collect::<Vec<String>>() | ||
}) | ||
.collect(); | ||
|
||
let words = frequency_words | ||
.into_par_iter() | ||
.take(keys.len()) | ||
.collect::<Vec<_>>(); | ||
|
||
Minifier { | ||
mapping: words | ||
.into_par_iter() | ||
.enumerate() | ||
.map(|(index, fw)| (fw.word.as_str(), keys.get(index).unwrap().to_owned())) | ||
.collect(), | ||
} | ||
} | ||
|
||
// Get the key to word mapping to help Javascript to decode the minified string. | ||
pub fn get_key_to_word_mapping(&self) -> HashMap<String, String> { | ||
self.mapping | ||
.iter() | ||
.map(|(key, value)| (value.to_owned(), (*key).to_owned())) | ||
.collect() | ||
} | ||
|
||
#[inline] | ||
pub fn minify_crate_name(&self, name: &str) -> String { | ||
let vec: Vec<&str> = name | ||
.split(|c| c == '_' || c == '-') | ||
.map(|item| self.mapping.get(item).map(Deref::deref).unwrap_or(item)) | ||
.collect(); | ||
vec.join("_") | ||
} | ||
|
||
#[inline] | ||
pub fn minify_description(&self, description: &str) -> String { | ||
description | ||
.split_word_bounds() | ||
.map(|item| self.mapping.get(item).map(Deref::deref).unwrap_or(item)) | ||
.collect() | ||
} | ||
|
||
#[inline] | ||
pub fn minify_js(json: &str) -> String { | ||
let tokens: Tokens = simple_minify(json) | ||
.into_iter() | ||
.map(|token| match token { | ||
Token::Keyword(Keyword::Null) => Token::Other("N"), | ||
_ => token, | ||
}) | ||
.collect::<Vec<_>>() | ||
.into(); | ||
aggregate_strings_into_array_filter(tokens, "C", |tokens, position| { | ||
// Ignore the key of json (AKA, the crate id). | ||
position > 5 && !tokens[position + 1].eq_char(ReservedChar::Colon) | ||
}) | ||
.to_string() | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
use argh::FromArgs; | ||
use rayon::prelude::*; | ||
use rustsec::{database::Query, Collection, Database}; | ||
use std::{collections::HashMap, fs, io::Write, path::Path}; | ||
|
||
use super::Task; | ||
|
||
const ADVISORY_INDEX_PATH: &str = "../docs/static/advisory"; | ||
|
||
/// Advisory task
// Fetches the RustSec advisory database and writes one JSON file per
// affected crate. The `///` lines are argh help text and left untouched.
#[derive(FromArgs)]
#[argh(subcommand, name = "advisory")]
pub struct AdvisoryTask {
    /// destination path
    #[argh(option, short = 'd', default = "ADVISORY_INDEX_PATH.to_string()")]
    dest_path: String,
}
|
||
impl Task for AdvisoryTask { | ||
fn execute(&self) -> crate::Result<()> { | ||
let mut map = HashMap::new(); | ||
let db = Database::fetch()?; | ||
for advisory in db | ||
.query(&Query::new().collection(Collection::Crates).withdrawn(false)) | ||
.into_iter() | ||
{ | ||
map.entry(&advisory.metadata.package) | ||
.or_insert_with(Vec::new) | ||
.push(advisory); | ||
} | ||
|
||
let path = Path::new(&self.dest_path); | ||
if !path.exists() { | ||
fs::create_dir(path)?; | ||
} | ||
map.par_iter_mut().for_each(|(package, advisories)| {≥ | ||
// sort advisories by date | ||
advisories.sort_by(|a, b| b.metadata.date.cmp(&a.metadata.date)); | ||
let package = package.as_str().replace('-', "_"); | ||
let mut file = fs::File::create(path.join(format!("{package}.json"))).unwrap(); | ||
let json = serde_json::to_string_pretty(&advisories).unwrap(); | ||
file.write_all(json.as_bytes()).unwrap(); | ||
}); | ||
Ok(()) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
use std::fs; | ||
use std::path::Path; | ||
|
||
use argh::FromArgs; | ||
use futures::future::try_join_all; | ||
use regex::Regex; | ||
use select::document::Document; | ||
use select::node::Node; | ||
use select::predicate::{Class, Name}; | ||
use serde::ser::SerializeTuple; | ||
use serde::{Serialize, Serializer}; | ||
use tokio::runtime::Runtime; | ||
|
||
use crate::minify::Minifier; | ||
use crate::tasks::Task; | ||
|
||
// Default output location for the generated books index module.
const BOOKS_INDEX_PATH: &str = "../lib/index/books.js";
// The extension's command registry, parsed to discover book names/URLs.
const COMMANDS: &str = include_str!("../../../lib/index/commands.js");
|
||
/// Books task
// Crawls every book listed in commands.js and writes a minified JS
// index of their pages. The `///` lines are argh help text, untouched.
#[derive(FromArgs)]
#[argh(subcommand, name = "books")]
pub struct BooksTask {
    /// destination path
    #[argh(option, short = 'd', default = "BOOKS_INDEX_PATH.to_string()")]
    dest_path: String,
}
|
||
/// One page (chapter/section entry) in a book's table of contents.
#[derive(Debug)]
struct Page {
    title: String,
    // Link target with any trailing ".html" stripped (see `Page::parse`).
    path: String,
    // Titles of the enclosing sections; `None` for top-level pages.
    parent_titles: Option<Vec<String>>,
}
|
||
#[derive(Serialize, Debug, Default)] | ||
struct Book<'a> { | ||
name: &'a str, | ||
url: &'a str, | ||
#[serde(skip_deserializing)] | ||
pages: Vec<Page>, | ||
} | ||
|
||
impl<'a> Book<'a> { | ||
fn is_empty(&self) -> bool { | ||
self.name.is_empty() || self.url.is_empty() | ||
} | ||
} | ||
|
||
impl Page { | ||
#[inline] | ||
fn parse(node: &Node) -> Option<Page> { | ||
if let Some(a) = node.first_child().filter(|n| n.is(Name("a"))) { | ||
let title = a.text(); | ||
let path = a | ||
.attr("href") | ||
.unwrap() | ||
.trim_end_matches(".html") | ||
.to_string(); | ||
|
||
Some(Page { | ||
title, | ||
path, | ||
parent_titles: None, | ||
}) | ||
} else { | ||
None | ||
} | ||
} | ||
} | ||
|
||
// Serialize a Page as a compact 3-element tuple
// `[title, path, parent_titles]` instead of a keyed object, to keep the
// generated index file small.
impl Serialize for Page {
    #[inline]
    fn serialize<S>(&self, serializer: S) -> Result<<S as Serializer>::Ok, <S as Serializer>::Error>
    where
        S: Serializer,
    {
        let mut ser = serializer.serialize_tuple(3)?;
        ser.serialize_element(&self.title)?;
        ser.serialize_element(&self.path)?;
        ser.serialize_element(&self.parent_titles)?;
        ser.end()
    }
}
|
||
// Recursively flattens a book's sidebar TOC node into a list of pages,
// tagging nested pages with the titles of their enclosing sections
// (presumably an mdBook-style `<ol class="chapter">` tree — the class
// names used here match mdBook's output; confirm against crawled books).
#[inline]
fn parse_node(node: &Node, parent_titles: Option<Vec<String>>) -> Vec<Page> {
    let mut pages = vec![];
    for child in node.children() {
        // Direct entry: either an expanded chapter item or a plain link.
        if child.is(Class("expanded")) || child.first_child().filter(|n| n.is(Name("a"))).is_some()
        {
            if let Some(mut page) = Page::parse(&child) {
                page.parent_titles = parent_titles.clone();
                pages.push(page);
            }
        } else {
            // Nested section: its heading is the previous sibling link.
            // Recurse with that heading appended to the ancestor titles.
            let mut new_parent_titles = parent_titles.clone().unwrap_or_default();
            if let Some(page) = child.prev().and_then(|n| Page::parse(&n)) {
                new_parent_titles.push(page.title);
                if let Some(section) = child.find(Class("section")).next() {
                    pages.extend(parse_node(&section, Some(new_parent_titles)))
                }
            }
        }
    }
    pages
}
|
||
// Downloads a book's page and scrapes its table of contents into
// `book.pages`. On parse failure the book is logged and an empty
// placeholder is returned (the caller filters it out via `is_empty`),
// so one unparsable book does not fail the whole crawl.
async fn fetch_book(mut book: Book<'_>) -> crate::Result<Book<'_>> {
    let html = reqwest::get(book.url).await?.text().await?;
    let doc = Document::from(html.as_str());
    if let Some(node) = doc.find(Class("chapter")).next() {
        book.pages = parse_node(&node, None);
        Ok(book)
    } else {
        println!("Parse failed, book `{}` is ignored.", book.name);
        Ok(Book::default())
    }
}
|
||
impl Task for BooksTask {
    // Bridges the synchronous `Task` interface to the async crawler by
    // creating a Tokio runtime for the duration of the task.
    fn execute(&self) -> crate::Result<()> {
        let rt = Runtime::new()?;
        rt.block_on(self.run())?;
        Ok(())
    }
}
|
||
impl BooksTask { | ||
async fn run(&self) -> crate::Result<()> { | ||
let re = Regex::new(r#"^\["(.*)",\s?"(.*)"\]"#).unwrap(); | ||
let mut books = vec![]; | ||
let mut started = false; | ||
for line in COMMANDS.lines() { | ||
if line.trim().starts_with("\"book\"") { | ||
started = true; | ||
} else if line.trim().starts_with("\"book/zh\"") { | ||
break; | ||
} | ||
|
||
if started { | ||
if let Some(capture) = re.captures(line.trim()) { | ||
let book = Book { | ||
name: capture.get(1).unwrap().as_str(), | ||
url: capture.get(2).unwrap().as_str(), | ||
pages: Vec::default(), | ||
}; | ||
books.push(book); | ||
} | ||
} | ||
} | ||
println!("{:?}", books); | ||
let futures: Vec<_> = books.into_iter().map(fetch_book).collect(); | ||
match try_join_all(futures).await { | ||
Ok(result) => { | ||
let books: Vec<_> = result.into_iter().filter(|book| !book.is_empty()).collect(); | ||
let contents = format!( | ||
"var N=null;const booksIndex={};export default booksIndex;", | ||
serde_json::to_string(&books)? | ||
); | ||
let path = Path::new(&self.dest_path); | ||
fs::write(path, Minifier::minify_js(&contents)).unwrap(); | ||
} | ||
Err(error) => { | ||
println!("{:?}", error); | ||
} | ||
} | ||
|
||
Ok(()) | ||
} | ||
} |
Oops, something went wrong.