diff --git a/article_dataset_builder/harvest.py b/article_dataset_builder/harvest.py
index e172447..5939bcd 100644
--- a/article_dataset_builder/harvest.py
+++ b/article_dataset_builder/harvest.py
@@ -108,7 +108,7 @@ def _load_config(self, path='./config.json'):
 
         # test if GROBID is up and running, except if we just want to download raw files
         if self.apply_grobid:
-            the_url = _grobid_url(self.config['grobid_base'], self.config['grobid_port'])
+            the_url = _grobid_url(self.config['grobid_base'])
             the_url += "isalive"
             try:
                 r = requests.get(the_url)
@@ -449,7 +449,7 @@ def run_grobid(self, pdf_file, output=None, annotation_output=None):
                 )
             }
 
-            the_url = _grobid_url(self.config['grobid_base'], self.config['grobid_port'])
+            the_url = _grobid_url(self.config['grobid_base'])
             the_url += "processFulltextDocument"
 
             # set the GROBID parameters
@@ -496,7 +496,7 @@ def run_grobid(self, pdf_file, output=None, annotation_output=None):
                 )
             }
 
-            the_url = _grobid_url(self.config['grobid_base'], self.config['grobid_port'])
+            the_url = _grobid_url(self.config['grobid_base'])
             the_url += "referenceAnnotations"
 
             # set the GROBID parameters
@@ -1458,11 +1458,8 @@ def _biblio_glutton_url(biblio_glutton_url):
         res = biblio_glutton_url[:-1]
     return res+"/service/lookup?"
 
-def _grobid_url(grobid_base, grobid_port):
-    the_url = 'http://'+grobid_base
-    if grobid_port is not None and len(grobid_port)>0:
-        the_url += ":"+grobid_port
-    the_url += "/api/"
+def _grobid_url(grobid_url):
+    the_url = grobid_url + "/api/"
     return the_url
 
 def _download(url, filename):
diff --git a/config.json b/config.json
index 27f95f5..3f849b2 100644
--- a/config.json
+++ b/config.json
@@ -14,8 +14,7 @@
     "biblio_glutton_base": "http://cloud.science-miner.com/glutton",
     "crossref_base": "https://api.crossref.org",
     "crossref_email": "",
-    "grobid_base": "localhost",
-    "grobid_port": "8070",
+    "grobid_base": "http://localhost:8070",
     "cord19_elsevier_map_path": "resources/elsevier_covid_map_26-07-2021.csv.gz",
     "cord19_elsevier_pdf_path": "",
     "pub2tei_path": ""
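
A minimal sketch of how the simplified helper resolves GROBID endpoints with the updated config.json value; the standalone config dict and the print calls below are illustrative only, built from the values shown in the diff above rather than from the harvester's actual setup code:

def _grobid_url(grobid_url):
    # grobid_base is now a full URL (scheme, host and optional port); no separate grobid_port
    the_url = grobid_url + "/api/"
    return the_url

# hypothetical stand-in for the loaded config.json
config = {"grobid_base": "http://localhost:8070"}

print(_grobid_url(config["grobid_base"]) + "isalive")
# http://localhost:8070/api/isalive
print(_grobid_url(config["grobid_base"]) + "processFulltextDocument")
# http://localhost:8070/api/processFulltextDocument

Note that, unlike _biblio_glutton_url, the new helper does not strip a trailing slash, so grobid_base is expected without one (e.g. "http://localhost:8070", not "http://localhost:8070/").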