diff --git a/.github/workflows/integrate.yaml b/.github/workflows/integrate.yaml index 60aa5dd..f13b1fa 100644 --- a/.github/workflows/integrate.yaml +++ b/.github/workflows/integrate.yaml @@ -28,11 +28,12 @@ jobs: uses: actions/setup-python@v1 with: python-version: ${{ matrix.python-version }} + - name: Install dependencies run: | python -m pip install --upgrade pip - pip install flake8 pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + make install + - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names diff --git a/Makefile b/Makefile index d16afc7..3adef00 100644 --- a/Makefile +++ b/Makefile @@ -15,3 +15,17 @@ deploy: build stop: $(compose_cmd) stop + +install: + pip install -e .['develop'] + +test: + pytest crawlclima/ -v + +clean: + @find ./ -name '*.pyc' -exec rm -f {} \; + @find ./ -name '*~' -exec rm -f {} \; + rm -rf .cache + rm -rf build + rm -rf dist + rm -rf *.egg-info diff --git a/crawlclima/redemet/rmet.py b/crawlclima/redemet/rmet.py index 0458572..4eda3b7 100644 --- a/crawlclima/redemet/rmet.py +++ b/crawlclima/redemet/rmet.py @@ -1,6 +1,7 @@ import datetime import math import os +import re import time import pandas as pd @@ -148,16 +149,18 @@ def describe(dataframe): return data -def capture_date_range(station, date): +def capture_date_range(station, date_start, date_end=None): """ Baixa dados da estação específica a partir da data especificada até a data de hoje :param station: código da estação - :param date: data de início da captura + :param date_start: data de início da captura + :param date_end: data final captura :return: """ - today = datetime.datetime.today() + if date_end is None: + date_end = datetime.datetime.today() check_day_station = lambda d: check_day(d, station) - dates = filter(check_day_station, date_generator(today, date)) + dates = filter(check_day_station, date_generator(date_end, date_start)) return list(filter(len, map(lambda d: capture(station, 
d), dates))) @@ -179,17 +182,26 @@ def capture(station, date): """ url = redemet_url(station, date) status = 0 - wait = 1 + wait = 3 while status != 200 and wait <= 16: resp = requests.get(url) status = resp.status_code time.sleep(wait) - wait *= 2 + wait *= 3 resp_data = resp.json() + with open('logs/capture-rmet.log', 'a') as f: + f.write("{}".format(resp_data['data']["data"])) + page = '' for dados in resp_data["data"]["data"]: mensagem = dados['mens'] + # check if there is more cases that pattern should treat(#53) + pattern = re.compile(r' [WSNE][0-9]{1,2}/[WSNE][0-9]{1,2}') + result = pattern.findall(mensagem) + for r in result: + mensagem = mensagem.replace(r, '') + date_receive = dados['recebimento'] # format date date_time_str = datetime.datetime.strptime( @@ -200,9 +212,11 @@ dataframe = parse_page(page) data = describe(dataframe) + if len(data) == 0: - logger.warning("Empty data for %s", date) + logger.warning("Empty data for %s in %s", station, date) return {} + data["date"] = date data["station"] = station diff --git a/crawlclima/tasks.py b/crawlclima/tasks.py index fc1676b..88b6c6c 100644 --- a/crawlclima/tasks.py +++ b/crawlclima/tasks.py @@ -1,4 +1,6 @@ import csv +import os +import sys import time from datetime import datetime, timedelta from io import StringIO @@ -23,6 +25,10 @@ logger = get_task_logger("Captura") +work_dir = os.getcwd() +route_abs = os.path.dirname(os.path.abspath(work_dir)) +sys.path.insert(0, route_abs) + def get_connection(): try: @@ -199,12 +205,17 @@ def fetch_redemet(self, station, date): data = capture_date_range(station, date) except Exception as e: logger.error( - "Error fetching from {} at {}: {}".format(station, date, e) + "Error fetching from {} at {}: error: {}".format( + station, date, e + ) ) + return try: - logger.info("Saving {}".format(station)) if len(data) > 0: save(data, schema="Municipio", table="Clima_wu") + logger.info("Saving {}".format(station)) + else: + 
logger.info("No data found {}".format(station)) except Exception as e: logger.error( "Error saving to db with {} at {}: {}".format(station, date, e) ) diff --git a/requirements-dev.txt b/requirements-dev.txt index d436ce5..96c24a7 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,9 +1,11 @@ --r requirements.txt +# -r requirements.txt nose==1.3.7 responses pgcli black -flake8==3.5.0 +flake8==3.7.9 isort==4.3.19 pre-commit==2.4.0 - +pandas +pytest +pyflakes==2.1.0 diff --git a/setup.py b/setup.py index c02e5ad..30050b3 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,14 @@ -import setuptools +from setuptools import find_packages, setup with open("README.md", "r") as fh: long_description = fh.read() -setuptools.setup( + +def read(filename): + return [req.strip() for req in open(filename) if req.strip() and not req.lstrip().startswith('#')] + + +setup( name="crawlclima", version="0.1.0", author="fccoelho", @@ -12,11 +17,13 @@ long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/AlertaDengue/AlertaDengueCaptura.git", - packages=setuptools.find_packages(), + packages=find_packages(), classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: GPL V3 License", "Operating System :: Linux", ], python_requires='>=3.7', + install_requires=read("requirements.txt"), + extras_require={'develop': read("requirements-dev.txt")}, )