# merge-to-nbl-lekon.R (complexico/holle-list-barrier-islands, branch main)
# Join the Lekon word list with the New Basic List (NBL)
# DOI for digitised NBL: https://doi.org/10.25446/oxford.23205173

library(tidyverse)
library(readxl)

source("merge-NBL-call.R") # run codes to retrieve the NBL and NBL's Concepticon mapping
source("gloss-English-and-Indonesian-added-to-unsp-Dutch.R")

# Read the raw Lekon plaintext, one element per line. The same vector is used
# for both the word list and the notes.
lekon_words <- read_lines("plaintexts/lekon.txt")

# Processing the word list =====
# The word-list entries are the lines strictly between the first two lines
# that contain "wordlist>".
wlist_tags <- str_which(lekon_words, "wordlist\\>")
stopifnot(
  "plaintexts/lekon.txt must contain an opening and a closing wordlist tag" =
    length(wlist_tags) >= 2
)
# seq_len() gives an empty selection when the two tag lines are adjacent;
# (open + 1):(close - 1) would count backwards and return the tag lines.
lekon_wlist <- lekon_words[
  wlist_tags[1] + seq_len(wlist_tags[2] - wlist_tags[1] - 1)
]

lekon_words_df <- tibble(lekon = lekon_wlist) |>
  mutate(
    # entry ID: leading run of non-space characters up to a "." that is
    # followed by whitespace
    Index = str_extract(lekon, "^[^ ]+(?=\\.\\s)"),
    # forms: the line with the leading "ID. " and every <...> marker removed
    Forms = str_replace_all(lekon, "(^[^ ]+\\.\\s|\\s?\\<[^>]+\\>)", ""),
    # note IDs: the text inside each <...> marker (list column, 0..n per line)
    Notes_id = str_extract_all(lekon, "(?<=\\<)([^>]+?)(?=\\>)")
  ) |>
  # one row per (entry, note ID); entries without any marker are kept
  unnest_longer(Notes_id, keep_empty = TRUE) |>
  mutate(across(where(is.character), \(x) replace_na(x, "")))
# Comma-separated forms are deliberately not split here; Forms is split later,
# after the notes have been joined.

# Check which Index values in Lekon are absent from the Index of the NBL
id_lekon_not_in_nbl <- which(!lekon_words_df$Index %in% holle_tb$Index)
lekon_words_df$Index[id_lekon_not_in_nbl]
# [1] "1080/1081" (the correct id in NBL is "1080-1081")
## Fix the index.
## The known bad ID is matched literally: comparing against the vector of all
## unmatched IDs would recycle once more than one row is unmatched, and would
## rewrite unrelated IDs to "1080-1081".
lekon_words_df <- lekon_words_df |>
  mutate(Index = if_else(Index == "1080/1081", "1080-1081", Index))
## Check again which Index values in Lekon are absent from the NBL
id_lekon_not_in_nbl <- which(!lekon_words_df$Index %in% holle_tb$Index)
lekon_words_df$Index[id_lekon_not_in_nbl]
# character(0)

# Processing the notes ====
# The notes are the lines strictly between the first two lines that contain
# <notes> or </notes>.
notes_tags <- str_which(lekon_words, "\\<\\/?notes\\>")
stopifnot(
  "plaintexts/lekon.txt must contain an opening and a closing notes tag" =
    length(notes_tags) >= 2
)
# seq_len() gives an empty selection when the two tag lines are adjacent;
# (open + 1):(close - 1) would count backwards and return the tag lines.
lekon_notes <- lekon_words[
  notes_tags[1] + seq_len(notes_tags[2] - notes_tags[1] - 1)
]
# Interactive check: which opening tags occur in the notes
lekon_notes |> str_extract_all("\\<[^<\\/]+?\\>") |> unlist() |> unique()
# [1] "<n>"       "<eng>"     "<form>"    "<comment>" "<idn>"     "<xr>"      "<ptr>"     "<tapah>"

lekon_notes_df <- tibble(lekon_notes) |>
  mutate(
    # note ID: leading non-space characters before the first ". "
    Notes_id = str_extract(lekon_notes, "^[^ ]+?(?=\\.\\s)"),
    # note body: the line without its leading "ID. "
    notes = str_replace(lekon_notes, "^[^ ]+\\.\\s", ""),
    # a note line may hold several "; "-separated parts
    notes = str_split(notes, "\\s?\\;\\s")
  ) |>
  # one row per note part
  unnest_longer(notes, keep_empty = TRUE) |>
  select(-lekon_notes) |>
  # pull the content of each tag into its own column (NA when the tag is absent)
  mutate(
    nt_form = str_extract(notes, "(?<=\\<form\\>)([^<]+?)(?=\\<\\/form\\>)"),
    nt_eng = str_extract(notes, "(?<=\\<eng\\>)([^<]+?)(?=\\<\\/eng\\>)"),
    nt_idn = str_extract(notes, "(?<=\\<idn\\>)([^<]+?)(?=\\<\\/idn\\>)"),
    nt_tapah = str_extract(notes, "(?<=\\<tapah\\>)([^<]+?)(?=\\<\\/tapah\\>)"),
    # drop a leading "T " from the tapah content
    nt_tapah = str_replace(nt_tapah, "^T\\s", ""),
    nt_comment = str_extract(notes, "(?<=\\<comment\\>)([^<]+?)(?=\\<\\/comment\\>)"),
    # <xr> content may itself contain <ptr> tags, so "<" is allowed inside it
    nt_xref = str_extract(notes, "(?<=\\<xr\\>)(.+?)(?=\\<\\/xr\\>)"),
    # ... and those <ptr>/</ptr> tags are then stripped
    nt_xref = str_replace_all(nt_xref, "\\<\\/?ptr\\>", "")
  ) |>
  select(-notes) |>
  mutate(across(where(is.character), \(x) replace_na(x, "")))

# Build the main table =====
# Attach the NBL columns to the Lekon word list, then attach the parsed notes.
# Both joins are natural joins: the keys are whichever column names the two
# tables share.
tb <- lekon_words_df |>
  left_join(holle_tb) |>
  left_join(lekon_notes_df) |>
  # rows without a matching note get "" rather than NA in character columns
  mutate(across(where(is.character), \(col) replace_na(col, "")))

# Read the later-added translations & prepare them for the main table ====
now_translated <- read_xlsx("data-raw/no-translation-now-translated.xlsx") |>
  rename(
    Index = id,
    English_add = English,
    Indonesian_add = Indonesian
  ) |>
  select(-c(form, Dutch))

# IDs in the main table whose English / Indonesian gloss is still empty
no_english_id <- tb |>
  filter(English == "") |>
  pull(Index)
no_indonesian_id <- tb |>
  filter(Indonesian == "") |>
  pull(Index)

# Keep only the added glosses that fill such a gap, one table per language
now_translated_eng <- now_translated |>
  filter(Index %in% no_english_id) |>
  select(-Indonesian_add)
now_translated_idn <- now_translated |>
  filter(Index %in% no_indonesian_id) |>
  select(-English_add)

# Merge the matching now-translated glosses into the main table ====
tb <- tb |>
  left_join(now_translated_eng) |>
  left_join(now_translated_idn) |>
  mutate(
    # take the added gloss where one was joined, otherwise keep the existing one
    English = coalesce(English_add, English),
    Indonesian = coalesce(Indonesian_add, Indonesian)
  ) |>
  select(-c(English_add, Indonesian_add))

# Join Concepticon ====
# Natural join on the shared column names; unmatched rows get "" instead of NA
# in every character column.
tb <- left_join(tb, concepticon_checked) |>
  mutate(across(where(is.character), \(col) replace_na(col, "")))

# Matching notes and forms for multiple forms and split forms in notes ====
## Highly customised, on a case-by-case basis!
## Step 1: on rows whose note form matches one of the patterns below, remove
## the other listed form from Forms.
## NOTE(review): the Forms pattern says "among" while the note pattern says
## "amang", and the note patterns say "bolêm" while the Forms patterns say
## "bòlêm" - presumably this mirrors the source text; confirm.
tb <- tb |>
  mutate(
    Forms = if_else(
      condition = str_detect(nt_form, "^inang.+tiri"),
      true = str_replace(Forms, "^among.+?(?=inang)", ""),
      false = Forms
    ),
    Forms = if_else(
      condition = str_detect(nt_form, "^amang.+tiri"),
      true = str_replace(Forms, "\\,\\sinang.+tiri", ""),
      false = Forms
    ),
    Forms = if_else(
      condition = str_detect(nt_form, "bolêm ònding"),
      true = str_replace(Forms, "bòlêm itêm\\,\\s", ""),
      false = Forms
    ),
    Forms = if_else(
      condition = str_detect(nt_form, "bolêm itêm"),
      true = str_replace(Forms, ",\\sbòlêm ònding", ""),
      false = Forms
    )
  )

## Step 2: one row per single form. Forms is split on "/" and then on ", ";
## nt_form is split on ", ".
tb <- tb |>
  separate_longer_delim(Forms, delim = "/") |>
  mutate(Forms = str_split(Forms, ", ")) |>
  unnest_longer(col = Forms) |>
  mutate(nt_form = str_split(nt_form, ", ")) |>
  unnest_longer(col = nt_form)

## Step 3: FormsAll holds the list form, or the note form when the list form
## is empty (the entry only pointed to a note).
tb <- tb |>
  mutate(
    FormsAll = if_else(
      condition = Forms == "" & nt_form != "",
      true = nt_form,
      false = Forms
    )
  ) |>
  distinct()

## Step 4: add the gloss for \ and - IDs; a gloss from nogloss3 replaces the
## existing one wherever it is present.
tb <- tb |>
  left_join(nogloss3) |>
  mutate(
    English = coalesce(English3, English),
    Indonesian = coalesce(Indonesian3, Indonesian)
  ) |>
  select(-English3, -Indonesian3)

## Step 5: FormsAll becomes Forms. The original list forms, which can be empty
## because the entry only referred to a note, are kept as FormsOrig.
tb <- tb |>
  rename(
    FormsOrig = Forms,
    Forms = FormsAll
  ) |>
  relocate(Forms, .after = Index) |>
  relocate(FormsOrig, .after = concept_url) |>
  distinct()

## Step 6: classify each row by the list its Index belongs to. Conditions are
## ordered from highest to lowest priority; "NBL" is the fallback.
tb <- tb |>
  mutate(
    list_type = case_when(
      str_detect(Index, "^add_") ~ "added_data",
      Index %in% holle_1931$Index ~ "added_list_1931",
      Index %in% holle_1904_1911$Index ~ "added_list_1904_1911",
      TRUE ~ "NBL"
    )
  )

# Save the final table as tab-separated values
write_tsv(tb, "data-output/lekon_tb.tsv")