-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
Automatic encoding
- Loading branch information
There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
This file was deleted.
This file was deleted.
This file was deleted.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
|
||
import json | ||
from typing import List | ||
import numpy as np | ||
import sys | ||
|
||
def sort_entries(entries: List, columns: List, *,
                 skip_error: bool = True) -> None:
    """Sort *entries* in place into reading order: column, then vertical.

    Each element of ``entries`` and ``columns`` is a dict whose ``"box"``
    key holds an ``(x, y, w, h)`` rectangle.  An entry is assigned to the
    first column whose rectangle contains the entry's center point, and
    entries are then ordered by ``(column index, center y)``.  Entries
    whose center lies in no column are pushed to the end.

    :param entries: entry dicts to sort in place; each gains a ``"key"`` item.
    :param columns: column dicts defining the bucket rectangles.
    :param skip_error: when ``False``, raise instead of silently pushing an
        unplaced entry to the end.
    :raises RuntimeError: when ``skip_error`` is ``False`` and an entry lies
        outside every column.
    """
    if not entries:
        return  # nothing to sort; also avoids indexing an empty array

    e_coords = np.array([e["box"] for e in entries], dtype="int")
    e_center_x = e_coords[:, 0] + e_coords[:, 2] // 2
    e_center_y = e_coords[:, 1] + e_coords[:, 3] // 2

    if columns:
        buckets = np.array([c["box"] for c in columns], dtype="int")
        # (x, y, w, h) => (x0, y0, x1, y1)
        x0 = buckets[:, 0]
        y0 = buckets[:, 1]
        x1 = buckets[:, 0] + buckets[:, 2]
        y1 = buckets[:, 1] + buckets[:, 3]

        # bucket_has[j, i] is True when entry i's center lies inside column j.
        bucket_has = np.logical_and.reduce([
            x0[:, np.newaxis] <= e_center_x[np.newaxis, :],
            x1[:, np.newaxis] > e_center_x[np.newaxis, :],
            y0[:, np.newaxis] <= e_center_y[np.newaxis, :],
            y1[:, np.newaxis] > e_center_y[np.newaxis, :],
        ])
        # First matching column per entry; argmax returns 0 when no column
        # matches, so bucket_has[k, i] is re-checked below to detect that.
        keys = np.argmax(bucket_has, axis=0)
    else:
        # No columns at all: every entry is unplaced.
        bucket_has = np.zeros((1, len(entries)), dtype=bool)
        keys = np.zeros(len(entries), dtype=int)

    for i, e in enumerate(entries):
        k = int(keys[i])
        if not bucket_has[k, i]:
            # Unplaced entry: sort it after every placed one.
            e["key"] = (np.inf, np.inf)
            if not skip_error:
                raise RuntimeError(
                    f"The entry {e} is not included in any column.")
        else:
            e["key"] = (k, int(e_center_y[i]))
    entries.sort(key=lambda e: e["key"])
|
||
def process(path_in, path_out):
    """Reorder the entries of one JSON page description.

    Reads *path_in*, sorts the elements whose ``"type"`` is ``ENTRY`` or
    ``TITLE_LEVEL_2`` into column reading order (using the
    ``COLUMN_LEVEL_1`` elements as column rectangles), then writes all
    elements to *path_out* with the sorted entries appended last; the
    relative order of non-sorted elements is preserved.

    :param path_in: path of the input JSON file (a list of element dicts).
    :param path_out: path of the reordered JSON file to write.
    """
    with open(path_in, encoding="utf-8") as f:
        data = json.load(f)

    types_to_sort = ('ENTRY', "TITLE_LEVEL_2")

    entries = [e for e in data if e["type"] in types_to_sort]
    columns = [e for e in data if e["type"] == 'COLUMN_LEVEL_1']
    sort_entries(entries, columns)

    # Keep the non-sorted elements first, then append the sorted entries.
    # (Renamed from 'all', which shadowed the builtin.)
    reordered = [e for e in data if e["type"] not in types_to_sort]
    reordered.extend(entries)

    # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding
    # instead of relying on the platform locale default.
    with open(path_out, "w", encoding="utf-8") as f:
        json.dump(reordered, f, indent=True, ensure_ascii=False)
|
||
|
||
# CLI entry point: guard so the module can be imported (e.g. for tests)
# without running the command-line driver.
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("""
    Usage: {} input.json output.json
    Reorder the entries of a json file.
    """
              .format(sys.argv[0]))
        # sys.exit instead of the site-module exit(): always available.
        sys.exit(1)

    process(sys.argv[1], sys.argv[2])
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,66 +1,119 @@ | ||
import json, os | ||
from script_balisage_formel import add_seg, add_signed | ||
from script_balisage_semantique import add_utterance, add_comment, add_incident, add_quote | ||
from script_nettoyage import nettoyage_saut_ligne | ||
import json, os, re | ||
from lxml import etree | ||
from script_compilation import compilation | ||
from script_nettoyage import clean_xml | ||
from script_metadonnees import var_metadata, build_teiheader | ||
|
||
# Chemin vers les fichiers JSON | ||
path_to_json = '/home/fanny/Documents/AGODA/Docs de travail AGODA Github/Transformations/json_data/' | ||
# Chemins vers les fichiers JSON et les fichiers XML | ||
path = os.path.dirname(__file__) | ||
path_to_json = os.path.join(os.path.abspath(os.path.join(path, os.pardir)), "json_data") | ||
path_to_xml = os.path.join(os.path.abspath(os.path.join(path, os.pardir)), "xml_data") | ||
|
||
beginning_elements = '<?xml version="1.0" encoding="UTF-8"?> <?xml-model ' \ | ||
'href="agoda_schema.rng" ' \ | ||
'type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?> <?xml-model ' \ | ||
'href="agoda_schema.rng" ' \ | ||
'type="application/xml" schematypens="http://purl.oclc.org/dsdl/schematron"?> <TEI ' \ | ||
'xmlns="http://www.tei-c.org/ns/1.0" xml:lang="fr"> ' | ||
end_elements = '</TEI>' | ||
|
||
compteur = 1 | ||
# Boucle permettant de lire chaque fichier JSON dont le nom finit par ".json" contenu dans un dossier dont le chemin | ||
# est donné | ||
|
||
for file_name in sorted([file for file in os.listdir(path_to_json) if file.endswith('.json')]): | ||
with open(os.path.join(path_to_json, file_name), encoding='utf-8') as json_file: | ||
data = json.load(json_file) | ||
|
||
# Fonction principale permettant d'appliquer l'ensemble des définitions définies dans les autres scripts | ||
def main(x,compteur): | ||
""" | ||
Expliquer fonction | ||
:return: | ||
""" | ||
# définir zwt | ||
for i in range(len(data)): | ||
if "comment" in data[i]: | ||
if re.search(r"body[^1]", data[i]["comment"]): | ||
inc = 0 | ||
zwt = int(data[i]['text_ocr'].split()[-1]) | ||
elif re.search(r"page-number", data[i]["comment"]) and not re.search(r"body", data[i]["comment"]): | ||
data[i]['text_ocr'] = "".join(['<pb n="', str(zwt + inc), '"/>']) | ||
inc += 1 | ||
elif re.search(r"page-number-ref", data[i]["comment"]): | ||
data[i]['text_ocr'] = "".join(['<ref target="#', str(zwt + inc), '"/>']) | ||
inc += 1 | ||
|
||
# Application de la fonction main | ||
date_pub, meetings, meeting_sitting, date_sitting = var_metadata(data) | ||
header = build_teiheader(date_pub, meetings, meeting_sitting, date_sitting) | ||
|
||
# Appeler les définitions classées par thématiques dans un ordre bien précis | ||
add_quote(x) | ||
add_incident(x) | ||
add_seg(x) | ||
add_comment(x) | ||
add_utterance(x) | ||
add_signed(x) | ||
nettoyage_saut_ligne(x) | ||
compilation(data, zwt, inc) | ||
|
||
# Gestion des éléments obligatoires en XML TEI à ajouter (élément racine par exemple) ici ? | ||
|
||
# Écriture du contenu de data dans les JSON - étape intermédiaire à enlever par la suite | ||
with open('essai' + str(compteur) + '.json', 'w') as f: | ||
json.dump(x, f) | ||
# Création fichier .xml en mode écriture | ||
for i in range(len(data)): | ||
if "comment" in data[i]: | ||
if "body" in data[i]["comment"]: | ||
|
||
return | ||
name = json_file | ||
file_name1 = re.split(r"_p", file_name)[0] | ||
output_xml = open(str(os.path.join(path_to_xml, file_name1 + ".xml")), mode="w") | ||
|
||
output_xml.write(beginning_elements) | ||
output_xml.write(header) | ||
|
||
# Boucle permettant d'écrire dans le fichier .xml tous les text_ocr de data | ||
if "text_ocr" in data[i]: | ||
if len(data[i]["text_ocr"]) > 0: | ||
output_xml.write(data[i]['text_ocr']) | ||
|
||
# ajouter espace entre boxes | ||
output_xml.write(" ") | ||
|
||
# Boucle permettant de lire chaque fichier JSON dont le nom finit par ".json" contenu dans un dossier dont le chemin est donné | ||
for file_name in [file for file in os.listdir(path_to_json) if file.endswith('.json')]: | ||
with open(path_to_json + file_name) as json_file: | ||
data = json.load(json_file) | ||
if "comment" in data[i]: | ||
if "text" in data[i]["comment"]: | ||
output_xml.write(end_elements) | ||
output_xml.close() | ||
|
||
# Application de la fonction main | ||
main(data, compteur) | ||
compteur +=1 | ||
# vérification du schéma TEI --> à gérer | ||
|
||
# Création fichier .xml en mode écriture | ||
for i in range(len(data)): | ||
if "comment" in data[i]: | ||
if "body" in data[i]["comment"]: | ||
name = json_file | ||
output_xml = open(str(file_name)+".xml", mode="w") | ||
# Boucle permettant d'écrire dans le fichier .xml tous les text_ocr de data | ||
if "text_ocr" in data[i]: | ||
if len(data[i]["text_ocr"]) > 0: | ||
output_xml.write(data[i]['text_ocr']) | ||
|
||
# autre méthode pour la création de fichiers .xml ? | ||
# écrire dans un fichier pour chacune des séances --> à gérer | ||
# nommer chaque fichier d'une certaine façon --> à gérer | ||
# vérification du schéma TEI --> à gérer | ||
# Nettoyage des fichiers xml | ||
|
||
output_xml.close() | ||
clean_xml(path_to_xml) | ||
|
||
# --------------------------------------------------------------------------------------------------------------------- | ||
|
||
# Ajout des xi:include dans le teiCorpus | ||
|
||
my_tree = etree.parse(os.path.join(path_to_xml, 'FR_3R_5L.xml')) | ||
my_root = my_tree.getroot() | ||
namespace = 'http://www.w3.org/2001/XInclude' | ||
|
||
|
||
for file_name in sorted([file for file in os.listdir(path_to_xml) if file.endswith('.xml')]): | ||
if len(str(file_name)) > 12: | ||
f = open(os.path.join(path_to_xml,'FR_3R_5L.xml'),mode = "r") | ||
text = f.read() | ||
if re.search(file_name,text) : | ||
print("ok") | ||
continue | ||
|
||
#if not | ||
|
||
# Ajout des xi:include en allant chercher les noms des fichiers xml | ||
|
||
my_root.append(etree.Element(etree.QName(namespace, 'include'), nsmap={'xi':namespace}, | ||
attrib={'href':str(file_name)})) | ||
|
||
# Sauvegarde | ||
file_to_save = os.path.join(path_to_xml, 'FR_3R_5L.xml') | ||
my_tree.write(file_to_save, pretty_print=True, encoding='utf-8', xml_declaration=True) | ||
|
||
# Vérification du respect du schéma relaxNG | ||
|
||
# # Ouverture du fichier rng | ||
# relaxng_doc = etree.parse(path_to_xml + '/agoda_schema.rng') | ||
# | ||
# # Association comme étant un schéma relaxNG | ||
# relaxng = etree.RelaxNG(relaxng_doc) | ||
# | ||
# # Vérification des erreurs sur l'ensemble des fichiers XML | ||
# relaxng.assertValid(my_tree) | ||
|
||
# --------------------------------------------------------------------------------------------------------------------- | ||
print(my_root) |