Commit 5ad3e8b

Fix #33

1 parent 36240a5 commit 5ad3e8b
1 file changed: +87 -98 lines changed

src/belgium/2_create_db.Rmd
Lines changed: 87 additions & 98 deletions
@@ -97,85 +97,107 @@ And table name:
 table_name <- "occ_all"
 ```

-### Define storage class for each column
+### Define columns to read

-The standard storage class is `TEXT`:
+We define a subset of columns, `cols_to_use`, that we are interested in, and their types:

-```{r def_default_cols_type}
-# default type: TEXT
-field_types <- rep("TEXT", length(cols_occ_file))
-names(field_types) <- cols_occ_file
+```{r columns_to_use}
+cols_to_use <- cols_only(
+  "gbifID" = col_double(),
+  "scientificName" = col_character(),
+  "kingdom" = col_character(),
+  "phylum" = col_character(),
+  "class" = col_character(),
+  "order" = col_character(),
+  "family" = col_character(),
+  "genus" = col_character(),
+  "specificEpithet" = col_character(),
+  "infraspecificEpithet" = col_character(),
+  "taxonRank" = col_character(),
+  "taxonomicStatus" = col_character(),
+  "datasetKey" = col_character(),
+  "basisOfRecord" = col_character(),
+  "occurrenceStatus" = col_character(),
+  "lastInterpreted" = col_datetime(),
+  "hasCoordinate" = col_logical(),
+  "hasGeospatialIssues" = col_logical(),
+  "decimalLatitude" = col_double(),
+  "decimalLongitude" = col_double(),
+  "coordinateUncertaintyInMeters" = col_double(),
+  "coordinatePrecision" = col_double(),
+  "pointRadiusSpatialFit" = col_double(),
+  "verbatimCoordinateSystem" = col_character(),
+  "verbatimSRS" = col_character(),
+  "eventDate" = col_date(),
+  "startDayOfYear" = col_double(),
+  "endDayOfYear" = col_double(),
+  "year" = col_double(),
+  "month" = col_double(),
+  "day" = col_double(),
+  "verbatimEventDate" = col_character(),
+  "samplingProtocol" = col_character(),
+  "samplingEffort" = col_character(),
+  "issue" = col_character(),
+  "identificationVerificationStatus" = col_character(),
+  "taxonKey" = col_double(),
+  "acceptedTaxonKey" = col_double(),
+  "kingdomKey" = col_double(),
+  "phylumKey" = col_double(),
+  "classKey" = col_double(),
+  "orderKey" = col_double(),
+  "familyKey" = col_double(),
+  "genusKey" = col_double(),
+  "subgenusKey" = col_double(),
+  "speciesKey" = col_double(),
+  "species" = col_character()
+)
 ```

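For context on the new column spec: readr's `cols_only()` returns a specification that keeps only the named columns and silently drops every other column at read time, so one object can describe both which columns to read and how to type them. A minimal sketch, assuming a hypothetical small tab-separated extract `occ_sample.txt`:

```r
library(readr)

# Only the two named columns are read; all other columns are dropped.
spec <- cols_only(
  gbifID = col_double(),
  scientificName = col_character()
)
occ_sample <- read_tsv("occ_sample.txt", col_types = spec)
spec(occ_sample)  # print the column spec readr actually applied
```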
-The following columns should be of storage class `INTEGER`:
-
-1. `*Key`, e.g. `taxonKey`, `speciesKey`, but no `datasetKey`
-2. `*DayOfYear`: `startDayOfYear` and `endDayOfYear`
-3. `year`
-4. `month`
-5. `day`
+Columns not present in csv:

-```{r set_to_integer}
-int_fields <- names(field_types)[str_detect(names(field_types), "Key") &
-                                 names(field_types) != "datasetKey"]
-int_fields <- c(
-  int_fields,
-  names(field_types)[str_detect(names(field_types), "DayOfYear")],
-  names(field_types)[names(field_types) == "year"],
-  names(field_types)[names(field_types) == "month"],
-  names(field_types)[names(field_types) == "day"]
-)
-field_types[which(names(field_types) %in% int_fields)] <- "INTEGER"
+```{r cols_not_present}
+names(cols_to_use$cols)[!names(cols_to_use$cols) %in% cols_occ_file]
 ```

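The check above works because a `cols_only()` specification stores its per-column collectors as a named list in its `cols` element, so `names(cols_to_use$cols)` recovers the selected column names. A tiny illustration with a made-up two-column spec:

```r
library(readr)

spec <- cols_only(gbifID = col_double(), species = col_character())
names(spec$cols)  # the names of the selected columns
#> [1] "gbifID"  "species"
```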
-The following columns should be of storage class `REAL`:
-
-1. `decimal*`: `decimalLatitude` and `decimalLongitude`
-2. `coordinate*`: `coordinateUncertaintyInMeters` and `coordinatePrecision`
-3. `pointRadiusSpatialFit`
+### Write csv to sqlite

-```{r set_to_real}
-real_fields <- names(field_types)[str_detect(names(field_types), "decimal")]
-real_fields <- c(
-  real_fields,
-  names(field_types)[str_detect(names(field_types), "coordinate")],
-  names(field_types)[names(field_types) == "pointRadiusSpatialFit"]
+```{r csv_to_sqlite}
+csv_to_sqlite(csv_file = occ_path,
+              sqlite_file = sqlite_path,
+              table_name = table_name,
+              pre_process_size = 50000,
+              chunk_size = 10000,
+              delim = "\t",
+              col_types = cols_to_use
 )
-field_types[which(names(field_types) %in% real_fields)] <- "REAL"
 ```

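For readers wondering what `csv_to_sqlite()` does conceptually: it streams the delimited file into the SQLite table in chunks, so memory use is bounded by `chunk_size` rather than by the size of the occurrence file (the `pre_process_size` argument suggests an initial scan to infer types not fixed by `col_types`). A hedged sketch of that chunked pattern using readr and DBI directly (not the helper's actual implementation), reusing the objects defined above:

```r
# Read the delimited file in chunks and append each chunk to the table,
# so the full occurrence file never has to fit in memory.
library(readr)
library(DBI)
library(RSQLite)

con <- dbConnect(SQLite(), dbname = sqlite_path)
read_delim_chunked(
  occ_path,
  delim = "\t",
  col_types = cols_to_use,
  chunk_size = 10000,
  callback = SideEffectChunkCallback$new(function(chunk, pos) {
    dbWriteTable(con, table_name, chunk, append = TRUE)
  })
)
dbDisconnect(con)
```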
+## Overview
 Open connection to database:

 ```{r open_connection_to_db}
 sqlite_occ <- dbConnect(SQLite(), dbname = sqlite_path)
 ```

-Fill database with occurrences from text file. This step reads the large occurrence file in chunks and transfers them in the sqlite file. This step can take long the first time you run it:
-
-```{r fill_sqlite_file}
-if (!table_name %in% dbListTables(sqlite_occ)) {
-  dbWriteTable(
-    conn = sqlite_occ,
-    name = table_name,
-    sep = "\t",
-    value = occ_path,
-    row.names = FALSE,
-    header = TRUE,
-    field.types = field_types,
-    overwrite = TRUE
-  )
-}
-```
-
-## Overview
-
 Number of columns present:

 ```{r check_fields_present}
 cols_occ_db <- dbListFields(sqlite_occ, table_name)
 length(cols_occ_db)
 ```

+Check: selected columns not in the SQLite database:
+
+```{r cols_not_present_in_db}
+names(cols_to_use$cols)[!names(cols_to_use$cols) %in% cols_occ_db]
+```
+
+Check: columns in SQLite database not in selected columns:
+
+```{r cols_in_db_not_selected}
+cols_occ_db[!cols_occ_db %in% names(cols_to_use$cols)]
+```
+
 Number of occurrences:

 ```{r n_occs_raw}
@@ -189,52 +211,20 @@ n_occs_total <- n_occs_total$`COUNT()`
 n_occs_total
 ```

-# Filter data
-
-## Define columns to select
+Preview first 100 rows from table `occ_all`:

-We define a subset of columns, `cols_to_use`, we are interested to:
-
-```{r columns_to_use}
-cols_to_use <- c(
-  "gbifID", "scientificName", "kingdom", "phylum", "class", "order", "family",
-  "genus", "specificEpithet", "infraspecificEpithet", "taxonRank",
-  "taxonomicStatus", "datasetKey", "basisOfRecord", "occurrenceStatus",
-  "lastInterpreted", "hasCoordinate", "hasGeospatialIssues", "decimalLatitude",
-  "decimalLongitude", "coordinateUncertaintyInMeters", "coordinatePrecision",
-  "pointRadiusSpatialFit", "verbatimCoordinateSystem", "verbatimSRS",
-  "eventDate", "startDayOfYear", "endDayOfYear", "year", "month", "day",
-  "verbatimEventDate", "samplingProtocol", "samplingEffort", "issue",
-  "identificationVerificationStatus", "taxonKey", "acceptedTaxonKey",
-  "kingdomKey", "phylumKey", "classKey", "orderKey", "familyKey", "genusKey",
-  "subgenusKey", "speciesKey", "species"
+```{r preview_df}
+query <- glue_sql("SELECT * FROM {big_table} LIMIT 100",
+                  big_table = table_name,
+                  .con = sqlite_occ
 )
+preview_df <- dbGetQuery(conn = sqlite_occ, query)
+preview_df
 ```

-Columns in occurrence file not in `cols_to_use`:
-
-```{r cols_in_cols_to_use_not_present_in_cols_occ_db}
-cols_to_use[which(!cols_to_use %in% cols_occ_db)]
-```
-
-will be removed from the selection:
-
-```{r remove_cols_not_in_cols_occ_db}
-cols_to_use <- cols_to_use[which(cols_to_use %in% cols_occ_db)]
-```
-
-Final number of columns to select:

-```{r n_cols_to_use}
-length(cols_to_use)
-```
-
-Storage class of these columns:

-```{r define_field_type_subset}
-field_types_subset <- field_types[which(names(field_types) %in% cols_to_use)]
-field_types_subset
-```
+# Filter data

 ## Define filters on occurrences

@@ -326,12 +316,11 @@ We create the new table with selected columns and filtered data on `occurrenceSt
 if (!table_name_subset %in% dbListTables(sqlite_occ)) {
   dbCreateTable(conn = sqlite_occ,
                 name = table_name_subset,
-                fields = field_types_subset)
+                fields = preview_df)
   query <- glue_sql(
-    "INSERT INTO {small_table} SELECT {`some_cols`*} FROM {big_table} WHERE
+    "INSERT INTO {small_table} SELECT * FROM {big_table} WHERE
     LOWER(identificationVerificationStatus) NOT IN ({unverified*}) AND LOWER(occurrenceStatus) NOT IN ({bad_status*}) AND ", issue_condition,
     small_table = table_name_subset,
-    some_cols = names(field_types_subset),
     big_table = table_name,
     unverified = identificationVerificationStatus_to_discard,
     bad_status = occurrenceStatus_to_discard,

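The `fields = preview_df` change works because `DBI::dbCreateTable()` accepts a data frame as its `fields` argument and derives the column names and types from it without inserting any rows, so the 100-row preview doubles as a template for the subset table. A minimal sketch:

```r
library(DBI)
library(RSQLite)

con <- dbConnect(SQLite(), dbname = ":memory:")

# A zero-row data frame is enough: only names and types are used.
template <- data.frame(gbifID = numeric(), species = character())
dbCreateTable(con, "occ_subset", fields = template)
dbListFields(con, "occ_subset")
#> [1] "gbifID"  "species"
dbDisconnect(con)
```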