-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01_get_merzmail.R
71 lines (50 loc) · 1.7 KB
/
01_get_merzmail.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# SCRIPT: 01_get_merzmail.R ----------------------------------------------------
#
# Author: Anton Könneke
#
# Scrape the #MerzMail pages from friedrich-merz.de and save them as a CSV file
#
# CREATED: 2023-06-03
#
# SETUP ------------------------------------------------------------------------
pacman::p_load(rvest, tidyverse)
#' Scrape a range of #MerzMail pages
#'
#' Builds one URL per index as `paste0(baseurl, index, "/")`, probes it with an
#' HTTP HEAD request and, when the status code is 200, downloads the page and
#' extracts its full text, date and title via fixed XPath expressions.
#'
#' A network or parse failure on a single page is reported as a warning and
#' recorded as missing values, so one bad request does not discard the pages
#' that were already scraped.
#'
#' @param baseurl Single string; URL prefix to which the index and a trailing
#'   "/" are appended.
#' @param index_from First index to fetch.
#' @param index_to Last index to fetch. Indices are `seq(index_from, index_to)`.
#' @param wait Single non-negative number; seconds to pause after each page.
#' @return An unnamed list with one element per URL. Each element is a list
#'   with `url`, `html_status`, `fulltext`, `date` and `title`. `html_status`
#'   is `NA` when the HEAD request failed; the three text fields are `NA` when
#'   the page was not retrieved or the XPath matched nothing.
get_mm <- function(baseurl = "https://www.friedrich-merz.de/merzmail/merzmail-",
                   index_from,
                   index_to,
                   wait = 3) {
  # Validate up front so a bad argument fails before any request is sent.
  if (!is.character(baseurl) || length(baseurl) != 1L || is.na(baseurl)) {
    stop("`baseurl` must be a single string.", call. = FALSE)
  }
  if (!is.numeric(wait) || length(wait) != 1L || is.na(wait) || wait < 0) {
    stop("`wait` must be a single non-negative number.", call. = FALSE)
  }

  # paste0() is vectorised, so no sapply() is needed to build the URLs.
  urls <- paste0(baseurl, seq(index_from, index_to), "/")

  # Absolute XPaths into the page layout; they break if the site markup changes.
  xpath_fulltext <-
    "/html/body/div[2]/div/section/div/div/div/div/div/div[4]/div"
  xpath_date <-
    "/html/body/div[2]/div/section/div/div/div/div/div/div[3]/div/ul/li/span"
  xpath_title <-
    "/html/body/div[2]/div/section/div/div/div/div/div/div[1]/div/span"

  # Turn a request/parse error into a warning plus a fallback value.
  or_else <- function(expr, fallback, what, url) {
    tryCatch(
      expr,
      error = function(e) {
        warning(what, " failed for ", url, ": ", conditionMessage(e),
                call. = FALSE)
        fallback
      }
    )
  }

  # Fetch one page and return its record; never throws on network errors.
  fetch_page <- function(url) {
    message("Fetching ", url)

    # Start from an all-missing record so every element has the same fields
    # and types regardless of which step fails.
    page <- list(
      url = url,
      html_status = NA_integer_,
      fulltext = NA_character_,
      date = NA_character_,
      title = NA_character_
    )

    # Check that the page exists before downloading the body.
    page$html_status <- or_else(
      httr::HEAD(url)$status_code, NA_integer_, "HEAD request", url
    )

    # isTRUE() keeps a missing status from reaching `if` as NA.
    if (isTRUE(page$html_status == 200)) {
      html <- or_else(read_html(url), NULL, "Download", url)
      if (!is.null(html)) {
        page$fulltext <- html %>%
          html_element(xpath = xpath_fulltext) %>%
          html_text()
        page$date <- html %>%
          html_element(xpath = xpath_date) %>%
          html_text2()
        page$title <- html %>%
          html_element(xpath = xpath_title) %>%
          html_text()
      }
    }

    # Throttle requests to be polite to the server.
    Sys.sleep(wait)
    page
  }

  lapply(urls, fetch_page)
}
# SCRAPE -----------------------------------------------------------------------
# Fetch pages 1 to 152, pausing one second after each request.
mm_scrape <- get_mm(index_from = 1, index_to = 152, wait = 1)

# Wrap the list of per-page records in a one-column tibble and spread each
# record's fields into their own columns (one row per page). The chain must
# start at `mm_scrape`; otherwise `mm` stays a raw nested list and the
# unnesting runs on an empty tibble.
mm <- mm_scrape %>%
  tibble() %>%
  unnest_wider(col = 1)

# SAVE -------------------------------------------------------------------------
# write.csv() does not create missing directories, so make sure it exists.
dir.create("texts", showWarnings = FALSE, recursive = TRUE)
write.csv(mm, file = "texts/mm_06-2023.csv")