Skip to content

Commit cdc0f7b

Browse files
committed
Add otus 18 and async scrapper
1 parent e8f664f commit cdc0f7b

File tree

8 files changed

+328
-0
lines changed

8 files changed

+328
-0
lines changed

async-web-scrapper/project.clj

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
;; Leiningen project definition for the asynchronous web-scraper example.
(defproject web-scraper "0.1.0-SNAPSHOT"
  :description "Simple WEB scraper example"

  :dependencies [[org.clojure/clojure "1.11.1"]
                 ;; channels, go blocks and pipelines used by the scraper
                 [org.clojure/core.async "1.6.673"]
                 ;; HTTP client (required as `clj-http.client` in the core namespace)
                 [clj-http "3.12.3"]
                 ;; HTML parser (`org.jsoup.Jsoup` is imported in the core namespace)
                 [org.jsoup/jsoup "1.16.1"]]

  ;; A REPL started for this project opens in the scraper namespace.
  :repl-options {:init-ns async-web-scraper.core})
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
(ns async-web-scraper.core
2+
(:require [clj-http.client :as http]
3+
[clojure.core.async
4+
:as async
5+
:refer [<! <!! >! chan close! go pipeline pipeline-async thread]]
6+
[clojure.string :as str])
7+
(:import (org.jsoup Jsoup)))
8+
9+
(defn async-request
  "Performs `http/request` with `opts` on a separate thread.

  Returns the channel produced by core.async for that thread; it delivers the
  response map once the request has completed."
  [opts]
  ;; `thread-call` is the function form of the `thread` macro.
  (async/thread-call (fn [] (http/request opts))))
11+
12+
(defn request-page
  "Issues an HTTP GET for `(str base-url href)` and parses the response body
  with Jsoup.

  Returns a channel that yields the parsed document. When no body is available
  (for example the request thread threw and its channel closed empty), the
  returned channel closes without a value, so takers see nil instead of the go
  block failing inside `Jsoup/parse`."
  [base-url href]
  (go
    (let [response (<! (async-request {:url    (str base-url href)
                                       :method :get}))]
      ;; A nil response or nil :body means there is nothing to parse.
      (when-some [body (:body response)]
        (Jsoup/parse body)))))
17+
18+
(defn- external-href?
  "True when `href` cannot be appended to the site's base URL: it contains a
  scheme separator (`https://`, `http://`, ...) or is protocol-relative
  (`//host/path`)."
  [href]
  (or (str/includes? href "://")
      (str/starts-with? href "//")))

(defn- internal-hrefs
  "Returns the `href` attribute of every element of `page` matched by the CSS
  query `selector`, with external links removed."
  [page selector]
  (->> (.select page selector)
       (map #(.attr % "href"))
       (remove external-href?)))

(defn extract-pagination-hrefs
  "Returns the site-relative hrefs of the paginator links found on `page`."
  [page]
  (internal-hrefs page "div[class^=paginator] > a[class^=link]"))

(defn extract-articles-hrefs
  "Returns the site-relative hrefs of the article cards found on `page`."
  [page]
  (internal-hrefs page "div[class^=card] a[class^=link]"))
31+
32+
(defn extract-data
  "Extracts summary data from a parsed article `page`.

  Returns a map with:
    :title       - text of the `title` element
    :author      - text of `div[class^=coAuthor] a p`
    :words-count - number of whitespace-separated words in the article
                   paragraphs (`div[class^=articleView] p`); 0 when the article
                   has no text."
  [page]
  (let [author      (-> page
                        (.select "div[class^=coAuthor] a p")
                        (.text))
        title       (-> page
                        (.select "title")
                        (.text))
        text        (->> (.select page "div[class^=articleView] p")
                         (map #(.text %))
                         (str/join "\n"))
        ;; Count runs of non-whitespace instead of splitting on whitespace:
        ;; `(str/split "" #"\s+")` yields [""] (one bogus word) and leading
        ;; whitespace adds an empty first token.
        words-count (count (re-seq #"\S+" text))]
    {:title       title
     :author      author
     ;; :text text
     :words-count words-count}))
47+
48+
(defn async-scraping
  "Scrapes articles of the site at `base-url` through a chain of channels.

  Options (keyword arguments):
    :entrypoint  - href of the page whose paginator is read first
    :limit-pages - maximum number of paginator links to follow (default 5)

  Stages:
    1. the entry page is fetched and up to `limit-pages` pagination hrefs are
       put on an internal channel;
    2. every listing page is fetched and its article hrefs are extracted;
    3. every article page is fetched;
    4. `extract-data` is applied to each article page.

  Returns the output channel of stage 4; it closes once every stage has
  drained. Pages that could not be fetched (nil from `request-page`) are
  skipped, so one failed request can neither stall a pipeline stage nor keep
  the output channel open forever."
  [base-url & {:keys [limit-pages entrypoint]
               :or   {limit-pages 5}}]
  (let [flow-href-c    (chan)
        article-href-c (chan 20)
        article-page-c (chan 20)
        out-c          (chan 100)]

    ;; request journal pages and extract hrefs to articles from them
    (pipeline-async 1 article-href-c
                    (fn [href result]
                      (go
                        (when-some [flow-page (<! (request-page base-url href))]
                          (doseq [article-href (extract-articles-hrefs flow-page)]
                            (>! result article-href)))
                        ;; pipeline-async waits for `result` to close, so it
                        ;; must be closed even when the page was not fetched
                        (close! result)))
                    flow-href-c)

    ;; request article pages by their hrefs
    (pipeline-async 1 article-page-c
                    (fn [href result]
                      (go
                        ;; nil cannot be put on a channel; skip failed pages
                        (when-some [article-page (<! (request-page base-url href))]
                          (>! result article-page))
                        (close! result)))
                    article-href-c)

    ;; extract data from articles pages
    (pipeline 4 out-c (map extract-data) article-page-c)

    ;; feed the first stage; always close it so downstream stages terminate
    (go
      (when-some [entry-page (<! (request-page base-url entrypoint))]
        (doseq [href (take limit-pages (extract-pagination-hrefs entry-page))]
          (>! flow-href-c href)))
      (close! flow-href-c))

    out-c))
85+
86+
(comment
87+
(def flow-page
88+
(<!! (request-page "https://journal.tinkoff.ru"
89+
"/flows/readers-travel/")))
90+
91+
(def article-page
92+
(<!! (request-page "https://journal.tinkoff.ru"
93+
"/roadtrip-usa-hidden-cost/")))
94+
95+
(extract-pagination-hrefs flow-page)
96+
(extract-articles-hrefs flow-page)
97+
;; => ("/13-days-in-turkey/"
98+
;; "/travel-with-kids-to-krasnoyarsk/"
99+
;; ...
100+
;; "/roadtrip-usa-hidden-cost/")
101+
102+
(extract-data article-page)
103+
104+
(def ch (async-scraping "https://journal.tinkoff.ru"
105+
:entrypoint "/flows/readers-travel/"
106+
:limit-pages 5))
107+
108+
(<!! ch)
109+
110+
(count (<!! (async/into [] ch)))
111+
112+
;; => 114
113+
)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
;; Test namespace for the scraper. It must require the namespace that actually
;; defines the scraper (`async-web-scraper.core`); `web-scrapper.core` does not
;; exist in this project, so loading the tests failed on the require.
(ns web-scraper.core-test
  (:require [clojure.test :refer :all]
            [async-web-scraper.core :refer :all]))
4+

otus-18/README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
### Урок #18
2+
3+
core.async
4+
5+
https://github.com/clojure/core.async
6+
7+
API Docs
8+
9+
https://clojure.github.io/core.async/
10+
11+
### ДЗ
12+
#### Научиться работать с https://pokeapi.co/ в асинхронном стиле.
13+
14+
- Научиться получать список покемонов — это отправная точка для сбора информации.
15+
- Получить для каждого покемона, упомянутого в ответе на первый запрос, его имя, наименования типов на заданном языке (их может быть несколько). Эти данные можно получить асинхронно, но результата следует подождать и сохранить оный в подходящие структуры данных.
16+
- Типы покемонов — справочные данные. Следует озаботиться получением полного перечня типов с их наименованиями, чтобы затем только обращаться к сохранённому справочнику при добавлении наименований типов к данным покемона.

otus-18/project.clj

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
;; Leiningen project definition for lesson 18 (core.async).
(defproject otus-18 "0.1.0-SNAPSHOT"
  :description "OTUS lesson 18: core.async examples and homework"
  ;; TODO: still the project-template placeholder; replace with the real URL.
  :url "http://example.com/FIXME"

  :dependencies [[org.clojure/clojure "1.11.1"]
                 [org.clojure/core.async "1.6.673"]
                 [clj-http "3.12.3"]
                 [cheshire "5.11.0"]]

  ;; clj-http-fake is only required by the test namespace, so it is kept out
  ;; of the main dependency list; the :dev profile is active for tests and REPL.
  :profiles {:dev {:dependencies [[clj-http-fake "1.0.4"]]}})

otus-18/src/otus_18/async.clj

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
(ns otus-18.async
  (:require [clojure.core.async :as a :refer [<! <!! >! >!!
                                              alts! chan close!
                                              go go-loop
                                              pipeline-async pipeline
                                              thread timeout]]
            [clojure.string :as str]))
7+
8+
;; Всем ли хорошо видно?
9+
10+
;; * Channels
11+
12+
;; Channel created without a buffer.
(def echo-chan (chan))
13+
;; The go block takes one value from `echo-chan` and prints it.
(go (println (<! echo-chan)))
14+
;; Blocking put from the calling thread; the go block above is the taker.
(>!! echo-chan "bottle")
15+
; => true
16+
; => bottle
17+
18+
;; * Buffers
19+
20+
;; Channel with a fixed buffer of two values.
(def echo-buffer (chan 2))
21+
(>!! echo-buffer :hello)
22+
; => true
23+
(>!! echo-buffer :world)
24+
; => true
(>!! echo-buffer :!)
26+
27+
; The third call blocks because the buffer is full
28+
29+
;; Note: `sliding-buffer` - when full, the oldest data is dropped from the buffer
30+
;; `dropping-buffer` - new data is not added to a full buffer
31+
;; with either of them `>!!` never blocks the calling thread
32+
33+
;; * Blocking and Parking
34+
35+
;; `>!`, `<!` - can only be used inside a `go` block
36+
;; `>!!`, `<!!` - meant for ordinary (non-go) threads; they block the calling
;;               thread, so inside a `go` block they would tie up a thread of
;;               the go pool instead of parking - prefer `>!`/`<!` there
37+
38+
(def n-ch (chan))
39+
;; Starts 1000 go blocks, each putting one number on `n-ch`; nothing in this
;; file takes from `n-ch`, so the puts stay pending (parked).
(doseq [n (range 1000)]
40+
(go (>! n-ch n)))
41+
42+
;; `>!`, `<!` - park thread
43+
;; `>!!`, `<!!` - block thread
44+
45+
;; * thread
46+
47+
(def ch (chan))
48+
49+
;; Nine go blocks each call the blocking `Thread/sleep` for one second before
;; putting on `ch`; the outer loop takes the nine values and `time` measures
;; the whole run.
(time (do
50+
(doseq [n (range 9)]
51+
(go
52+
(Thread/sleep 1000)
53+
(>! ch n)))
54+
(doseq [_ (range 9)]
55+
(<!! ch))))
56+
57+
; => "Elapsed time: 2016.62955 msecs"
;; NOTE(review): ~2 s for nine 1-second sleeps suggests the go blocks ran in
;; two batches - presumably bounded by the go thread-pool size (8 by default;
;; confirm against the core.async docs).
58+
59+
;; `thread` runs its body on a separate thread, where blocking takes are fine.
(def another-echo-chan (chan))
60+
(thread (println (<!! another-echo-chan)))
61+
(>!! another-echo-chan "hi")
62+
; => true
63+
; => hi
64+
65+
;; * alts!
66+
67+
(def ch-a (chan))
68+
(def ch-b (chan))
69+
;; `alts!` waits on both channels and completes at most one operation; the
;; printed result is a `[value channel]` pair.
(go (println (alts! [ch-a ch-b])))
70+
(>!! ch-b "bottle")
71+
72+
;; Same choice, with a 1000 ms `timeout` channel added as a third alternative.
(go (println (alts! [ch-a ch-b (timeout 1000)])))
73+
74+
;; * pipeline
75+
76+
77+
;; Transducer applied to every value that flows through the pipeline.
;; Uses the `str` alias required in the ns form instead of relying on
;; `clojure.string` having been loaded by some other namespace.
(def capitalizer (map str/capitalize))

(def input (chan))
(def output (chan))

;; Prints every value arriving on `output` until a take returns nil.
(go-loop []
  (when-let [x (<! output)]
    (println x)
    (recur)))

;; One worker moves values from `input` to `output` through `capitalizer`.
(pipeline 1 output capitalizer input)

(>!! input "hello")

;; By default `pipeline` closes `output` once `input` is closed and drained.
(close! input)

;; Returns nil once `output` is closed; it competes with the go-loop above for
;; any value that is still pending.
(<!! output)
94+
95+
;; * pipeline-async
96+
97+
;; Source and destination channels of the async pipeline.
(def ca> (chan 1))
(def cb> (chan 1))

(defn c-af
  "Async function for `pipeline-async`. Unlike a transducer it takes the input
  value plus a `result` channel: after a one-second timeout it puts `val`
  followed by \"!!!\" on `result` three times and then closes `result`."
  [val result]
  (go
    (<! (timeout 1000))
    (dotimes [_ 3]
      (>! result (str val "!!!")))
    (close! result)))

(pipeline-async 1 cb> c-af ca>)

;; Print the first value that reaches `cb>`, then feed the pipeline.
(go (println (<! cb>)))
(>!! ca> "hello")
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
(ns otus-18.homework.pokemons)
2+
3+
;; Root URL of the PokeAPI v2 REST API; the two URLs below are built from it.
(def base-url "https://pokeapi.co/api/v2")
4+
;; URL of the pokemon resource.
(def pokemons-url (str base-url "/pokemon"))
5+
;; URL of the type resource.
;; NOTE(review): named `-path`, yet it holds a full URL just like `pokemons-url`.
(def type-path (str base-url "/type"))
6+
7+
(defn extract-pokemon-name
  "Returns the value stored under `:name` in the `pokemon` map, or nil when the
  key is absent."
  [pokemon]
  (get pokemon :name))
9+
10+
(defn extract-type-name
  "Looks through the `:names` entries of `pokemon-type` for the first one whose
  `[:language :name]` equals `lang` and returns that entry's `:name`.
  Returns nil when no entry matches."
  [pokemon-type lang]
  (let [in-lang?  (fn [entry] (= lang (get-in entry [:language :name])))
        localized (first (filter in-lang? (:names pokemon-type)))]
    (:name localized)))
15+
16+
(defn get-pokemons
  "Intended contract: asynchronously request the list of pokemons and the names
  of their types in the given language, and return a map whose keys are pokemon
  names (in English) and whose values are collections of type names in that
  language.

  Options: `:limit` (default 50) and `:lang` (default \"ja\").

  Not implemented yet - the body is a stub and always returns nil."
  [& {:keys [limit lang] :or {limit 50 lang "ja"}}]
  ;; TODO: homework - implement according to the docstring.
  nil)
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
(ns otus-18.homework.pokemons-test
2+
(:require
3+
[cheshire.core :as cheshire]
4+
[otus-18.homework.pokemons :as subject]
5+
[clj-http.fake :refer [with-global-fake-routes-in-isolation]]
6+
[clojure.test :refer [deftest is]]))
7+
8+
;; Checks `get-pokemons` against stubbed PokeAPI responses (no network access).
;; Pokemon and type resources are stubbed under both their name-based and
;; id-based URLs - presumably so the test does not depend on which of the two
;; the implementation follows.
(deftest get-pokemons-test
  (with-global-fake-routes-in-isolation
    {;; pokemon list
     {:address "https://pokeapi.co/api/v2/pokemon" :query-params {:limit 50 :lang "ja"}}
     (fn [_request]
       {:status 200
        :body   (cheshire/generate-string
                 {:results
                  [{:name "pikachu"
                    :url  "https://pokeapi.co/api/v2/pokemon/1/"}]})})

     ;; pokemon details, addressed by name
     {:address "https://pokeapi.co/api/v2/pokemon/pikachu" :query-params {:lang "ja"}}
     (fn [_request]
       {:status 200
        :body   (cheshire/generate-string
                 {:name  "pikachu"
                  :types [{:slot 1
                           :type {:name "electric",
                                  :url  "https://pokeapi.co/api/v2/type/13/"}}]})})

     ;; pokemon details, addressed by the URL returned in the list
     {:address "https://pokeapi.co/api/v2/pokemon/1/" :query-params {:lang "ja"}}
     (fn [_request]
       {:status 200
        :body   (cheshire/generate-string
                 {:name  "pikachu"
                  :types [{:slot 1
                           :type {:name "electric",
                                  :url  "https://pokeapi.co/api/v2/type/13/"}}]})})

     ;; type details, addressed by name
     "https://pokeapi.co/api/v2/type/electric"
     (fn [_request]
       {:status 200
        :body   (cheshire/generate-string
                 {:names [{:language {:name "ja",
                                      :url  "https://pokeapi.co/api/v2/language/11/"},
                           :name     "でんき"}]})})

     ;; type details, addressed by the URL found in the pokemon details.
     ;; `:names` has to be a vector of {:language ... :name ...} entries like
     ;; the name-based stub above; the previous shape (a bare map without
     ;; `:name`) could never yield the expected type name.
     "https://pokeapi.co/api/v2/type/13/"
     (fn [_request]
       {:status 200
        :body   (cheshire/generate-string
                 {:names [{:language {:name "ja",
                                      :url  "https://pokeapi.co/api/v2/language/11/"},
                           :name     "でんき"}]})})}

    (is (= {"pikachu" ["でんき"]}
           (subject/get-pokemons :lang "ja")))))

0 commit comments

Comments
 (0)