refactor bulk to use pandas

Javimtib92 · Javimtib92 · commit 894eba751393 · 2024-04-19T18:11:38.000+02:00
diff --git a/backend/app/main.py b/backend/app/main.py
@@ -1,7 +1,8 @@
 from math import ceil
 import os
-import csv
-import io
+from fastapi.concurrency import run_in_threadpool
+import pandas as pd
+import numpy as np
 from contextlib import asynccontextmanager
 from typing import List, Optional
 
@@ -21,8 +22,6 @@
     StatisticsResponse,
 )
 from app.search import Search, create_es_mapping, get_es
-from app.utils import is_float
-
 
 @asynccontextmanager
 async def lifespan(_: FastAPI, es: Optional[Search] = Depends(get_es)):
@@ -103,28 +102,14 @@ async def clear_index_endpoint(
     }
 
 
-def csv_row_generator(upload_file):
-    with io.TextIOWrapper(
-        upload_file.file,
-        # using utf-8-sig encoding as suggested by https://github.com/clld/clldutils/issues/65#issuecomment-344953000
-        encoding="utf-8-sig",
-        newline="",
-    ) as text_file:
-        csv_reader = csv.reader(text_file, delimiter=";")
-
-        headers = next(csv_reader)
-
-        for row in csv_reader:
-            data = {}
-            for index, header in enumerate(headers):
-                value = row[index]
-
-                if is_float(value):
-                    value = float(value.replace(",", "."))
-
-                data[header.lower()] = value
-
-            yield {"_index": "politicians", **data}
+async def csv_row_generator(upload_file):
+    """Read the uploaded ";"-delimited CSV with pandas and yield one "politicians" dict per row."""
+    df = await run_in_threadpool(pd.read_csv, upload_file.file, delimiter=";", decimal=",", engine="c", encoding="utf-8-sig")
+    # Swap NaN for None so missing values reach consumers as None rather than NaN.
+    df = df.replace(np.nan, None)
+    df = df.rename(lambda x: x.lower(), axis='columns')
+    for data in df.to_dict(orient="records"):
+        yield {"_index": "politicians", **data}
 
 
 @app.post(
diff --git a/backend/app/schemas.py b/backend/app/schemas.py
@@ -35,7 +35,7 @@ class Politician(BaseModel):
     )
     @classmethod
     def string_to_float(cls, value: str) -> float:
-        if value == "":
+        if value is None:
             return 0.0
         try:
             return float(value)
diff --git a/backend/app/search.py b/backend/app/search.py
@@ -41,14 +41,14 @@ async def get_es() -> Optional[Search]:
     return Search()
 
 
-type_map: Dict[type, str] = {
-    str: "keyword",
-    datetime: "date",
-    int: "long",
-    float: "float",
-    list: "keyword",
-    dict: "nested",
-    List[BaseModel]: "nested",
+type_map: Dict[type, Dict[str, str]] = {  # Python type -> Elasticsearch field parameters
+    str: {"type": "keyword", "null_value": ""},
+    datetime: {"type": "date", "null_value": ""},  # NOTE(review): "" may not parse as a date null_value; confirm
+    int: {"type": "long", "null_value": 0},
+    float: {"type": "float", "null_value": 0},
+    list: {"type": "keyword", "null_value": ""},
+    dict: {"type": "nested"},  # nested entries carry no "null_value" key
+    List[BaseModel]: {"type": "nested"}
 }
 
 
@@ -79,5 +79,5 @@ def create_es_mapping(pydantic_model: BaseModel) -> Dict[str, str]:
         ):
             mapping[field] = {"type": "text", "fields": {"raw": {"type": "keyword"}}}
         else:
-            mapping[field] = {"type": es_field_type}
+            mapping[field] = {"type": es_field_type.get('type'), "null_value": es_field_type.get('null_value')}
     return mapping
diff --git a/backend/poetry.lock b/backend/poetry.lock
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
@@ -11,6 +11,8 @@ fastapi = "^0.110.1"
 uvicorn = { extras = ["standard"], version = "^0.29.0" }
 elasticsearch = { extras = ["async"], version = "^8.13.0" }
 python-multipart = "^0.0.9"
+pandas = "^2.2.2"
+numpy = "^1.26.4"
 
 
 [tool.poetry.group.dev.dependencies]

Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@ class Politician(BaseModel):`
`35`	`35`	`)`
`36`	`36`	`@classmethod`
`37`	`37`	`def string_to_float(cls, value: str) -> float:`
`38`		`- if value == "":`
	`38`	`+ if value is None:`
`39`	`39`	`return 0.0`
`40`	`40`	`try:`
`41`	`41`	`return float(value)`