@@ -97,85 +97,107 @@ And table name:
97
97
table_name <- "occ_all"
98
98
```
99
99
100
- ### Define storage class for each column
100
+ ### Define columns to read
101
101
102
- The standard storage class is ` TEXT ` :
102
+ We define a subset of columns, ` cols_to_use ` , we are interested in, and their types:
103
103
104
- ``` {r def_default_cols_type}
105
- # default type: TEXT
106
- field_types <- rep("TEXT", length(cols_occ_file))
107
- names(field_types) <- cols_occ_file
104
+ ``` {r columns_to_use}
105
+ cols_to_use <- cols_only(
106
+ "gbifID" = col_double(),
107
+ "scientificName" = col_character(),
108
+ "kingdom" = col_character(),
109
+ "phylum" = col_character(),
110
+ "class" = col_character(),
111
+ "order" = col_character(),
112
+ "family" = col_character(),
113
+ "genus" = col_character(),
114
+ "specificEpithet" = col_character(),
115
+ "infraspecificEpithet" = col_character(),
116
+ "taxonRank" = col_character(),
117
+ "taxonomicStatus" = col_character(),
118
+ "datasetKey" = col_character(),
119
+ "basisOfRecord" = col_character(),
120
+ "occurrenceStatus" = col_character(),
121
+ "lastInterpreted" = col_datetime(),
122
+ "hasCoordinate" = col_logical(),
123
+ "hasGeospatialIssues" = col_logical(),
124
+ "decimalLatitude" = col_double(),
125
+ "decimalLongitude" = col_double(),
126
+ "coordinateUncertaintyInMeters" = col_double(),
127
+ "coordinatePrecision" = col_double(),
128
+ "pointRadiusSpatialFit" = col_double(),
129
+ "verbatimCoordinateSystem" = col_character(),
130
+ "verbatimSRS" = col_character(),
131
+ "eventDate" = col_date(),
132
+ "startDayOfYear" = col_double(),
133
+ "endDayOfYear" = col_double(),
134
+ "year" = col_double(),
135
+ "month" = col_double(),
136
+ "day" = col_double(),
137
+ "verbatimEventDate" = col_character(),
138
+ "samplingProtocol" = col_character(),
139
+ "samplingEffort" = col_character(),
140
+ "issue" = col_character(),
141
+ "identificationVerificationStatus" = col_character(),
142
+ "taxonKey" = col_double(),
143
+ "acceptedTaxonKey" = col_double(),
144
+ "kingdomKey" = col_double(),
145
+ "phylumKey" = col_double(),
146
+ "classKey" = col_double(),
147
+ "orderKey" = col_double(),
148
+ "familyKey" = col_double(),
149
+ "genusKey" = col_double(),
150
+ "subgenusKey" = col_double(),
151
+ "speciesKey" = col_double(),
152
+ "species" = col_character()
153
+ )
108
154
```
109
155
110
- The following columns should be of storage class ` INTEGER ` :
111
-
112
- 1 . ` *Key ` , e.g. ` taxonKey ` , ` speciesKey ` , but no ` datasetKey `
113
- 2 . ` *DayOfYear ` : ` startDayOfYear ` and ` endDayOfYear `
114
- 3 . ` year `
115
- 4 . ` month `
116
- 5 . ` day `
156
+ Columns defined in ` cols_to_use ` but not present in the csv file:
117
157
118
- ``` {r set_to_integer}
119
- int_fields <- names(field_types)[str_detect(names(field_types), "Key") &
120
- names(field_types) != "datasetKey"]
121
- int_fields <- c(
122
- int_fields,
123
- names(field_types)[str_detect(names(field_types), "DayOfYear")],
124
- names(field_types)[names(field_types) == "year"],
125
- names(field_types)[names(field_types) == "month"],
126
- names(field_types)[names(field_types) == "day"]
127
- )
128
- field_types[which(names(field_types) %in% int_fields)] <- "INTEGER"
158
+ ``` {r cols_not_present}
159
+ names(cols_to_use$cols)[!names(cols_to_use$cols) %in% cols_occ_file]
129
160
```
130
161
131
- The following columns should be of storage class ` REAL ` :
132
-
133
- 1 . ` decimal* ` : ` decimalLatitude ` and ` decimalLongitude `
134
- 2 . ` coordinate* ` : ` coordinateUncertaintyInMeters ` and ` coordinatePrecision `
135
- 3 . ` pointRadiusSpatialFit `
162
+ ### Write csv to sqlite
136
163
137
- ``` {r set_to_real}
138
- real_fields <- names(field_types)[str_detect(names(field_types), "decimal")]
139
- real_fields <- c(
140
- real_fields,
141
- names(field_types)[str_detect(names(field_types), "coordinate")],
142
- names(field_types)[names(field_types) == "pointRadiusSpatialFit"]
164
+ ``` {r csv_to_sqlite}
165
+ csv_to_sqlite(csv_file = occ_path,
166
+ sqlite_file = sqlite_path,
167
+ table_name = table_name,
168
+ pre_process_size = 50000,
169
+ chunk_size = 10000,
170
+ delim = "\t",
171
+ col_types = cols_to_use
143
172
)
144
- field_types[which(names(field_types) %in% real_fields)] <- "REAL"
145
173
```
146
174
175
+ ## Overview
147
176
Open connection to database:
148
177
149
178
``` {r open_connection_to_db}
150
179
sqlite_occ <- dbConnect(SQLite(), dbname = sqlite_path)
151
180
```
152
181
153
- Fill database with occurrences from text file. This step reads the large occurrence file in chunks and transfers them in the sqlite file. This step can take long the first time you run it:
154
-
155
- ``` {r fill_sqlite_file}
156
- if (!table_name %in% dbListTables(sqlite_occ)) {
157
- dbWriteTable(
158
- conn = sqlite_occ,
159
- name = table_name,
160
- sep = "\t",
161
- value = occ_path,
162
- row.names = FALSE,
163
- header = TRUE,
164
- field.types = field_types,
165
- overwrite = TRUE
166
- )
167
- }
168
- ```
169
-
170
- ## Overview
171
-
172
182
Number of columns present:
173
183
174
184
``` {r check_fields_present}
175
185
cols_occ_db <- dbListFields(sqlite_occ, table_name)
176
186
length(cols_occ_db)
177
187
```
178
188
189
+ Check: selected columns not in the SQLite database:
190
+
191
+ ``` {r cols_to_use_not_in_db}
192
+ names(cols_to_use$cols)[!names(cols_to_use$cols) %in% cols_occ_db]
193
+ ```
194
+
195
+ Check: columns in SQLite database not in selected columns:
196
+
197
+ ``` {r cols_in_db_not_selected}
198
+ cols_occ_db[!cols_occ_db %in% names(cols_to_use$cols)]
199
+ ```
200
+
179
201
Number of occurrences:
180
202
181
203
``` {r n_occs_raw}
@@ -189,52 +211,20 @@ n_occs_total <- n_occs_total$`COUNT()`
189
211
n_occs_total
190
212
```
191
213
192
- # Filter data
193
-
194
- ## Define columns to select
214
+ Preview the first 100 rows of table ` occ_all ` :
195
215
196
- We define a subset of columns, ` cols_to_use ` , we are interested to:
197
-
198
- ``` {r columns_to_use}
199
- cols_to_use <- c(
200
- "gbifID", "scientificName", "kingdom", "phylum", "class", "order", "family",
201
- "genus", "specificEpithet", "infraspecificEpithet", "taxonRank",
202
- "taxonomicStatus", "datasetKey", "basisOfRecord", "occurrenceStatus",
203
- "lastInterpreted", "hasCoordinate", "hasGeospatialIssues", "decimalLatitude",
204
- "decimalLongitude", "coordinateUncertaintyInMeters", "coordinatePrecision",
205
- "pointRadiusSpatialFit", "verbatimCoordinateSystem", "verbatimSRS",
206
- "eventDate", "startDayOfYear", "endDayOfYear", "year", "month", "day",
207
- "verbatimEventDate", "samplingProtocol", "samplingEffort", "issue",
208
- "identificationVerificationStatus", "taxonKey", "acceptedTaxonKey",
209
- "kingdomKey", "phylumKey", "classKey", "orderKey", "familyKey", "genusKey",
210
- "subgenusKey", "speciesKey", "species"
216
+ ``` {r preview_df}
217
+ query <- glue_sql("SELECT * FROM {big_table} LIMIT 100",
218
+ big_table = table_name,
219
+ .con = sqlite_occ
211
220
)
221
+ preview_df <- dbGetQuery(conn = sqlite_occ, query)
222
+ preview_df
212
223
```
213
224
214
- Columns in occurrence file not in ` cols_to_use ` :
215
-
216
- ``` {r cols_in_cols_to_use_not_present_in_cols_occ_db}
217
- cols_to_use[which(!cols_to_use %in% cols_occ_db)]
218
- ```
219
-
220
- will be removed from the selection:
221
-
222
- ``` {r remove_cols_not_in_cols_occ_db}
223
- cols_to_use <- cols_to_use[which(cols_to_use %in% cols_occ_db)]
224
- ```
225
-
226
- Final number of columns to select:
227
225
228
- ``` {r n_cols_to_use}
229
- length(cols_to_use)
230
- ```
231
-
232
- Storage class of these columns:
233
226
234
- ``` {r define_field_type_subset}
235
- field_types_subset <- field_types[which(names(field_types) %in% cols_to_use)]
236
- field_types_subset
237
- ```
227
+ # Filter data
238
228
239
229
## Define filters on occurrences
240
230
@@ -326,12 +316,11 @@ We create the new table with selected columns and filtered data on `occurrenceSt
326
316
if (!table_name_subset %in% dbListTables(sqlite_occ)) {
327
317
dbCreateTable(conn = sqlite_occ,
328
318
name = table_name_subset,
329
- fields = field_types_subset )
319
+ fields = preview_df )
330
320
query <- glue_sql(
331
- "INSERT INTO {small_table} SELECT {`some_cols`*} FROM {big_table} WHERE
321
+ "INSERT INTO {small_table} SELECT * FROM {big_table} WHERE
332
322
LOWER(identificationVerificationStatus) NOT IN ({unverified*}) AND LOWER(occurrenceStatus) NOT IN ({bad_status*}) AND ", issue_condition,
333
323
small_table = table_name_subset,
334
- some_cols = names(field_types_subset),
335
324
big_table = table_name,
336
325
unverified = identificationVerificationStatus_to_discard,
337
326
bad_status = occurrenceStatus_to_discard,
0 commit comments