
Commit 1e9753b: Add rust tasks
1 parent 757a364

14 files changed: 1183 additions, 0 deletions

rust/Cargo.toml

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
[package]
name = "rust-search-extension"
version = "0.1.0"
authors = ["Folyd <[email protected]>"]
edition = "2021"
publish = false

[dependencies]
serde = { version = "1", features = ["derive"] }
serde_json = "1"
minifier = "0.3"
unicode-segmentation = "1"
futures = "0.3"
tokio = { version = "1", features = ["macros", "time", "rt-multi-thread"] }
reqwest = { version = "0.11", features = ["json"] }
select = "0.6"
semver = { version = "1", features = ["serde"] }
rayon = "1"
regex = "1"
argh = "0.1"
rustsec = "0"
html-escape = "0"
db-dump = "0.7.1"

rust/src/frequency.rs

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
#[derive(Debug)]
pub struct FrequencyWord {
    pub word: String,
    pub frequency: usize,
}

impl FrequencyWord {
    #[inline]
    pub fn score(&self) -> usize {
        // Because the two-character prefix + suffix key replaces the word,
        // only `len - 2` characters are saved per occurrence, so we subtract 2
        // before weighting by frequency. This leads to a 0.4% reduction in file size.
        (self.word.len() - 2) * self.frequency
    }
}
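For context (not part of the commit), here is a minimal sketch with hypothetical words and frequencies showing how `score` ranks substitution candidates: since only `len - 2` characters are saved per occurrence, a longer but moderately frequent word can outscore a shorter, very frequent one.

use crate::frequency::FrequencyWord;

fn rank_example() {
    // Hypothetical frequency data, not taken from the real corpus.
    let words = vec![
        FrequencyWord { word: "the".to_string(), frequency: 1000 },
        FrequencyWord { word: "library".to_string(), frequency: 300 },
    ];
    // "the": (3 - 2) * 1000 = 1000; "library": (7 - 2) * 300 = 1500.
    let best = words.iter().max_by_key(|w| w.score()).unwrap();
    assert_eq!(best.word, "library");
}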

rust/src/main.rs

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
use std::error::Error;

use argh::FromArgs;

use crate::tasks::*;

mod frequency;
mod minify;
mod tasks;

/// Options
#[derive(FromArgs)]
struct Options {
    #[argh(subcommand)]
    subcommand: Subcommand,
}

/// Subcommands
#[derive(FromArgs)]
#[argh(subcommand)]
#[non_exhaustive]
enum Subcommand {
    Advisory(AdvisoryTask),
    Crates(CratesTask),
    Books(BooksTask),
    Caniuse(CaniuseTask),
    Lints(LintsTask),
    Labels(LabelsTask),
    Rfcs(RfcsTask),
    Rustc(RustcTask),
    Targets(TargetsTask),
}

pub type Result<T> = std::result::Result<T, Box<dyn Error>>;

fn main() -> Result<()> {
    let options: Options = argh::from_env();
    match options.subcommand {
        Subcommand::Advisory(cmd) => cmd.execute()?,
        Subcommand::Crates(cmd) => cmd.execute()?,
        Subcommand::Books(cmd) => cmd.execute()?,
        Subcommand::Caniuse(cmd) => cmd.execute()?,
        Subcommand::Lints(cmd) => cmd.execute()?,
        Subcommand::Labels(cmd) => cmd.execute()?,
        Subcommand::Rfcs(cmd) => cmd.execute()?,
        Subcommand::Rustc(cmd) => cmd.execute()?,
        Subcommand::Targets(cmd) => cmd.execute()?,
    }
    Ok(())
}
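The `Task` trait itself lives in the `tasks` module, which is part of this commit but not shown in this excerpt. Judging from the `impl Task for AdvisoryTask` and `impl Task for BooksTask` blocks below, it is roughly the following (an inferred sketch, not the exact definition):

pub trait Task {
    fn execute(&self) -> crate::Result<()>;
}

With `argh`, each subcommand struct declares its own name, so a task is run as, for example, `cargo run -- advisory -d <dest-path>` from the `rust/` directory.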

rust/src/minify.rs

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
use minifier::js::{
    aggregate_strings_into_array_filter, simple_minify, Keyword, ReservedChar, Token, Tokens,
};
use rayon::prelude::*;
use std::collections::HashMap;
use std::ops::Deref;
use unicode_segmentation::UnicodeSegmentation;

use crate::frequency::FrequencyWord;

#[derive(Debug)]
pub struct Minifier<'a> {
    // A word-to-key mapping, such as <"cargo", "$0">.
    mapping: HashMap<&'a str, String>,
}

impl<'a> Minifier<'a> {
    const PREFIX: &'static str = "@$^&";
    const SUFFIX: &'static str = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

    pub fn new(frequency_words: &'a [FrequencyWord]) -> Minifier<'a> {
        let keys: Vec<String> = Self::PREFIX
            .chars()
            .flat_map(|prefix| {
                Self::SUFFIX
                    .chars()
                    .map(|suffix| format!("{prefix}{suffix}"))
                    .collect::<Vec<String>>()
            })
            .collect();

        let words = frequency_words
            .into_par_iter()
            .take(keys.len())
            .collect::<Vec<_>>();

        Minifier {
            mapping: words
                .into_par_iter()
                .enumerate()
                .map(|(index, fw)| (fw.word.as_str(), keys.get(index).unwrap().to_owned()))
                .collect(),
        }
    }

    // Get the key-to-word mapping that lets the JavaScript side decode the minified string.
    pub fn get_key_to_word_mapping(&self) -> HashMap<String, String> {
        self.mapping
            .iter()
            .map(|(key, value)| (value.to_owned(), (*key).to_owned()))
            .collect()
    }

    #[inline]
    pub fn minify_crate_name(&self, name: &str) -> String {
        let vec: Vec<&str> = name
            .split(|c| c == '_' || c == '-')
            .map(|item| self.mapping.get(item).map(Deref::deref).unwrap_or(item))
            .collect();
        vec.join("_")
    }

    #[inline]
    pub fn minify_description(&self, description: &str) -> String {
        description
            .split_word_bounds()
            .map(|item| self.mapping.get(item).map(Deref::deref).unwrap_or(item))
            .collect()
    }

    #[inline]
    pub fn minify_js(json: &str) -> String {
        let tokens: Tokens = simple_minify(json)
            .into_iter()
            .map(|token| match token {
                Token::Keyword(Keyword::Null) => Token::Other("N"),
                _ => token,
            })
            .collect::<Vec<_>>()
            .into();
        aggregate_strings_into_array_filter(tokens, "C", |tokens, position| {
            // Ignore the JSON keys (i.e., the crate IDs).
            position > 5 && !tokens[position + 1].eq_char(ReservedChar::Colon)
        })
        .to_string()
    }
}
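A brief usage sketch (hypothetical frequency words, not part of the commit) of how the pieces fit together: `Minifier::new` assigns a two-character key to each of the top words in order, `minify_description` swaps words for keys, and `get_key_to_word_mapping` yields the reverse table that the JavaScript side uses to decode the text again.

use crate::frequency::FrequencyWord;
use crate::minify::Minifier;

fn minify_example() {
    // Hypothetical frequency data.
    let words = vec![
        FrequencyWord { word: "search".to_string(), frequency: 500 },
        FrequencyWord { word: "extension".to_string(), frequency: 400 },
    ];
    let minifier = Minifier::new(&words);
    // Words are keyed in order, so "search" -> "@0" and "extension" -> "@1",
    // turning "A search extension for Rust" into "A @0 @1 for Rust".
    let minified = minifier.minify_description("A search extension for Rust");
    assert_eq!(minified, "A @0 @1 for Rust");
    // The reverse mapping ("@0" -> "search", ...) is emitted alongside the
    // minified index so the front end can restore the original words.
    let decode_table = minifier.get_key_to_word_mapping();
    assert_eq!(decode_table.get("@0").map(String::as_str), Some("search"));
}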

rust/src/tasks/advisory.rs

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
use argh::FromArgs;
use rayon::prelude::*;
use rustsec::{database::Query, Collection, Database};
use std::{collections::HashMap, fs, io::Write, path::Path};

use super::Task;

const ADVISORY_INDEX_PATH: &str = "../docs/static/advisory";

/// Advisory task
#[derive(FromArgs)]
#[argh(subcommand, name = "advisory")]
pub struct AdvisoryTask {
    /// destination path
    #[argh(option, short = 'd', default = "ADVISORY_INDEX_PATH.to_string()")]
    dest_path: String,
}

impl Task for AdvisoryTask {
    fn execute(&self) -> crate::Result<()> {
        let mut map = HashMap::new();
        let db = Database::fetch()?;
        for advisory in db
            .query(&Query::new().collection(Collection::Crates).withdrawn(false))
            .into_iter()
        {
            map.entry(&advisory.metadata.package)
                .or_insert_with(Vec::new)
                .push(advisory);
        }

        let path = Path::new(&self.dest_path);
        if !path.exists() {
            fs::create_dir(path)?;
        }
        map.par_iter_mut().for_each(|(package, advisories)| {
            // Sort advisories by date, newest first.
            advisories.sort_by(|a, b| b.metadata.date.cmp(&a.metadata.date));
            let package = package.as_str().replace('-', "_");
            let mut file = fs::File::create(path.join(format!("{package}.json"))).unwrap();
            let json = serde_json::to_string_pretty(&advisories).unwrap();
            file.write_all(json.as_bytes()).unwrap();
        });
        Ok(())
    }
}
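The resulting layout is one file per affected crate under `dest_path`, named after the package with hyphens replaced by underscores, as the `replace('-', "_")` call above shows. A trivial illustration with a hypothetical package name:

fn advisory_file_name_example() {
    // Hypothetical package name, purely for illustration.
    let package = "openssl-src".replace('-', "_");
    // The task would write this crate's advisories, pretty-printed and
    // sorted newest first, to <dest_path>/openssl_src.json.
    assert_eq!(package, "openssl_src");
}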

rust/src/tasks/books.rs

Lines changed: 171 additions & 0 deletions
@@ -0,0 +1,171 @@
use std::fs;
use std::path::Path;

use argh::FromArgs;
use futures::future::try_join_all;
use regex::Regex;
use select::document::Document;
use select::node::Node;
use select::predicate::{Class, Name};
use serde::ser::SerializeTuple;
use serde::{Serialize, Serializer};
use tokio::runtime::Runtime;

use crate::minify::Minifier;
use crate::tasks::Task;

const BOOKS_INDEX_PATH: &str = "../lib/index/books.js";
const COMMANDS: &str = include_str!("../../../lib/index/commands.js");

/// Books task
#[derive(FromArgs)]
#[argh(subcommand, name = "books")]
pub struct BooksTask {
    /// destination path
    #[argh(option, short = 'd', default = "BOOKS_INDEX_PATH.to_string()")]
    dest_path: String,
}

#[derive(Debug)]
struct Page {
    title: String,
    path: String,
    parent_titles: Option<Vec<String>>,
}

#[derive(Serialize, Debug, Default)]
struct Book<'a> {
    name: &'a str,
    url: &'a str,
    #[serde(skip_deserializing)]
    pages: Vec<Page>,
}

impl<'a> Book<'a> {
    fn is_empty(&self) -> bool {
        self.name.is_empty() || self.url.is_empty()
    }
}

impl Page {
    #[inline]
    fn parse(node: &Node) -> Option<Page> {
        if let Some(a) = node.first_child().filter(|n| n.is(Name("a"))) {
            let title = a.text();
            let path = a
                .attr("href")
                .unwrap()
                .trim_end_matches(".html")
                .to_string();

            Some(Page {
                title,
                path,
                parent_titles: None,
            })
        } else {
            None
        }
    }
}

impl Serialize for Page {
    #[inline]
    fn serialize<S>(&self, serializer: S) -> Result<<S as Serializer>::Ok, <S as Serializer>::Error>
    where
        S: Serializer,
    {
        let mut ser = serializer.serialize_tuple(3)?;
        ser.serialize_element(&self.title)?;
        ser.serialize_element(&self.path)?;
        ser.serialize_element(&self.parent_titles)?;
        ser.end()
    }
}

#[inline]
fn parse_node(node: &Node, parent_titles: Option<Vec<String>>) -> Vec<Page> {
    let mut pages = vec![];
    for child in node.children() {
        if child.is(Class("expanded")) || child.first_child().filter(|n| n.is(Name("a"))).is_some()
        {
            if let Some(mut page) = Page::parse(&child) {
                page.parent_titles = parent_titles.clone();
                pages.push(page);
            }
        } else {
            let mut new_parent_titles = parent_titles.clone().unwrap_or_default();
            if let Some(page) = child.prev().and_then(|n| Page::parse(&n)) {
                new_parent_titles.push(page.title);
                if let Some(section) = child.find(Class("section")).next() {
                    pages.extend(parse_node(&section, Some(new_parent_titles)))
                }
            }
        }
    }
    pages
}

async fn fetch_book(mut book: Book<'_>) -> crate::Result<Book<'_>> {
    let html = reqwest::get(book.url).await?.text().await?;
    let doc = Document::from(html.as_str());
    if let Some(node) = doc.find(Class("chapter")).next() {
        book.pages = parse_node(&node, None);
        Ok(book)
    } else {
        println!("Parse failed, book `{}` is ignored.", book.name);
        Ok(Book::default())
    }
}

impl Task for BooksTask {
    fn execute(&self) -> crate::Result<()> {
        let rt = Runtime::new()?;
        rt.block_on(self.run())?;
        Ok(())
    }
}

impl BooksTask {
    async fn run(&self) -> crate::Result<()> {
        let re = Regex::new(r#"^\["(.*)",\s?"(.*)"\]"#).unwrap();
        let mut books = vec![];
        let mut started = false;
        for line in COMMANDS.lines() {
            if line.trim().starts_with("\"book\"") {
                started = true;
            } else if line.trim().starts_with("\"book/zh\"") {
                break;
            }

            if started {
                if let Some(capture) = re.captures(line.trim()) {
                    let book = Book {
                        name: capture.get(1).unwrap().as_str(),
                        url: capture.get(2).unwrap().as_str(),
                        pages: Vec::default(),
                    };
                    books.push(book);
                }
            }
        }
        println!("{:?}", books);
        let futures: Vec<_> = books.into_iter().map(fetch_book).collect();
        match try_join_all(futures).await {
            Ok(result) => {
                let books: Vec<_> = result.into_iter().filter(|book| !book.is_empty()).collect();
                let contents = format!(
                    "var N=null;const booksIndex={};export default booksIndex;",
                    serde_json::to_string(&books)?
                );
                let path = Path::new(&self.dest_path);
                fs::write(path, Minifier::minify_js(&contents)).unwrap();
            }
            Err(error) => {
                println!("{:?}", error);
            }
        }

        Ok(())
    }
}
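One detail worth noting: the manual `Serialize` impl above emits each `Page` as a compact three-element array (`[title, path, parent_titles]`) rather than an object, which keeps the generated `books.js` small. Inside this module, the shape looks like this (hypothetical page data, not part of the commit):

fn page_shape_example() -> crate::Result<()> {
    // Hypothetical page, purely for illustration.
    let page = Page {
        title: "Installation".to_string(),
        path: "ch01-01-installation".to_string(),
        parent_titles: Some(vec!["Getting Started".to_string()]),
    };
    // Serialized as a tuple, not a map with field names.
    assert_eq!(
        serde_json::to_string(&page)?,
        r#"["Installation","ch01-01-installation",["Getting Started"]]"#
    );
    Ok(())
}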
