Skip to content

Commit 894eba7

Browse files
committed
refactor bulk to use pandas
1 parent 6aa271e commit 894eba7

File tree

5 files changed

+186
-37
lines changed

5 files changed

+186
-37
lines changed

backend/app/main.py

+11-26
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from math import ceil
22
import os
3-
import csv
4-
import io
3+
from fastapi.concurrency import run_in_threadpool
4+
import pandas as pd
5+
import numpy as np
56
from contextlib import asynccontextmanager
67
from typing import List, Optional
78

@@ -21,8 +22,6 @@
2122
StatisticsResponse,
2223
)
2324
from app.search import Search, create_es_mapping, get_es
24-
from app.utils import is_float
25-
2625

2726
@asynccontextmanager
2827
async def lifespan(_: FastAPI, es: Optional[Search] = Depends(get_es)):
@@ -103,28 +102,14 @@ async def clear_index_endpoint(
103102
}
104103

105104

106-
def csv_row_generator(upload_file):
107-
with io.TextIOWrapper(
108-
upload_file.file,
109-
# using utf-8-sig encoding as suggested by https://github.com/clld/clldutils/issues/65#issuecomment-344953000
110-
encoding="utf-8-sig",
111-
newline="",
112-
) as text_file:
113-
csv_reader = csv.reader(text_file, delimiter=";")
114-
115-
headers = next(csv_reader)
116-
117-
for row in csv_reader:
118-
data = {}
119-
for index, header in enumerate(headers):
120-
value = row[index]
121-
122-
if is_float(value):
123-
value = float(value.replace(",", "."))
124-
125-
data[header.lower()] = value
126-
127-
yield {"_index": "politicians", **data}
105+
async def csv_row_generator(upload_file):
    """Yield one bulk-index action per row of an uploaded CSV file.

    The file is parsed as ``;``-delimited, ``utf-8-sig``-encoded text with
    ``,`` as the decimal separator. Parsing is handed to a worker thread via
    ``run_in_threadpool`` so the blocking ``pd.read_csv`` call does not run
    on the event loop.

    Args:
        upload_file: Object exposing the uploaded binary stream as ``.file``
            (presumably a FastAPI ``UploadFile`` -- confirm against caller).

    Yields:
        dict: ``{"_index": "politicians", **row}`` where the row keys are the
        lower-cased CSV column names and missing values are ``None``.
    """
    df = await run_in_threadpool(
        pd.read_csv,
        upload_file.file,
        delimiter=";",
        decimal=",",
        engine="c",
        encoding="utf-8-sig",
    )

    # Use the mapping form of ``replace``: with a scalar ``to_replace`` and
    # ``value=None`` some pandas versions treat None as "no value given" and
    # pad-fill instead of writing None into the missing cells.
    df = df.replace({np.nan: None})
    df = df.rename(columns=str.lower)

    for data in df.to_dict(orient="records"):
        yield {"_index": "politicians", **data}
112+
128113

129114

130115
@app.post(

backend/app/schemas.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ class Politician(BaseModel):
3535
)
3636
@classmethod
3737
def string_to_float(cls, value: str) -> float:
38-
if value == "":
38+
if value is None:
3939
return 0.0
4040
try:
4141
return float(value)

backend/app/search.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -41,14 +41,14 @@ async def get_es() -> Optional[Search]:
4141
return Search()
4242

4343

44-
type_map: Dict[type, str] = {
45-
str: "keyword",
46-
datetime: "date",
47-
int: "long",
48-
float: "float",
49-
list: "keyword",
50-
dict: "nested",
51-
List[BaseModel]: "nested",
44+
# Elasticsearch mapping parameters per Python field type. Each value carries
# the ES field "type" and, where one makes sense, the "null_value" that is
# indexed in place of an explicit null.
# NOTE(review): presumably consumed by create_es_mapping -- confirm.
type_map: Dict[type, Dict[str, object]] = {
    str: {"type": "keyword", "null_value": ""},
    # No null_value for dates: Elasticsearch requires null_value to have the
    # same data type as the field, and "" cannot be parsed as a date.
    datetime: {"type": "date"},
    int: {"type": "long", "null_value": 0},
    float: {"type": "float", "null_value": 0},
    list: {"type": "keyword", "null_value": ""},
    # Nested fields take no null_value.
    dict: {"type": "nested"},
    List[BaseModel]: {"type": "nested"},
}
5353

5454

@@ -79,5 +79,5 @@ def create_es_mapping(pydantic_model: BaseModel) -> Dict[str, str]:
7979
):
8080
mapping[field] = {"type": "text", "fields": {"raw": {"type": "keyword"}}}
8181
else:
82-
mapping[field] = {"type": es_field_type}
82+
mapping[field] = {"type": es_field_type.get('type'), "null_value": es_field_type.get('null_value')}
8383
return mapping

backend/poetry.lock

+163-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

backend/pyproject.toml

+2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ fastapi = "^0.110.1"
1111
uvicorn = { extras = ["standard"], version = "^0.29.0" }
1212
elasticsearch = { extras = ["async"], version = "^8.13.0" }
1313
python-multipart = "^0.0.9"
14+
pandas = "^2.2.2"
15+
numpy = "^1.26.4"
1416

1517

1618
[tool.poetry.group.dev.dependencies]

0 commit comments

Comments
 (0)