Skip to content

Commit

Permalink
Merge pull request #3 from mpuren/automatic_encoding
Browse files Browse the repository at this point in the history
Automatic encoding
  • Loading branch information
atomegoyan authored Jul 29, 2022
2 parents 972d9a6 + 1cef216 commit 54732dd
Show file tree
Hide file tree
Showing 90 changed files with 122,909 additions and 36,770 deletions.
5,273 changes: 5,273 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0175.json

Large diffs are not rendered by default.

4,018 changes: 4,018 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0176.json

Large diffs are not rendered by default.

3,740 changes: 3,740 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0177.json

Large diffs are not rendered by default.

3,761 changes: 3,761 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0178.json

Large diffs are not rendered by default.

4,460 changes: 4,460 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0179.json

Large diffs are not rendered by default.

4,657 changes: 4,657 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0180.json

Large diffs are not rendered by default.

4,355 changes: 4,355 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0181.json

Large diffs are not rendered by default.

5,011 changes: 5,011 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0182.json

Large diffs are not rendered by default.

5,079 changes: 5,079 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0183.json

Large diffs are not rendered by default.

4,975 changes: 4,975 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0184.json

Large diffs are not rendered by default.

4,889 changes: 4,889 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0185.json

Large diffs are not rendered by default.

4,950 changes: 4,950 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0186.json

Large diffs are not rendered by default.

4,886 changes: 4,886 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0187.json

Large diffs are not rendered by default.

4,736 changes: 4,736 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0188.json

Large diffs are not rendered by default.

5,149 changes: 5,149 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0189.json

Large diffs are not rendered by default.

5,761 changes: 5,761 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0190.json

Large diffs are not rendered by default.

5,003 changes: 5,003 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0191.json

Large diffs are not rendered by default.

5,743 changes: 5,743 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0192.json

Large diffs are not rendered by default.

5,803 changes: 5,803 additions & 0 deletions json_data/FR_3R_5L_1889-11-26_p0193.json

Large diffs are not rendered by default.

4,768 changes: 0 additions & 4,768 deletions json_data/FR_3R_5L_1889-11-26_p175.json

This file was deleted.

3,840 changes: 0 additions & 3,840 deletions json_data/FR_3R_5L_1889-11-26_p176.json

This file was deleted.

3,724 changes: 0 additions & 3,724 deletions json_data/FR_3R_5L_1889-11-26_p178.json

This file was deleted.

3,547 changes: 3,547 additions & 0 deletions json_data/FR_3R_5L_1890-01-14_p0001.json

Large diffs are not rendered by default.

5,482 changes: 5,482 additions & 0 deletions json_data/FR_3R_5L_1890-01-14_p0002.json

Large diffs are not rendered by default.

4,854 changes: 4,854 additions & 0 deletions json_data/FR_3R_5L_1890-01-14_p0003.json

Large diffs are not rendered by default.

4,083 changes: 4,083 additions & 0 deletions json_data/FR_3R_5L_1890-01-14_p0004.json

Large diffs are not rendered by default.

1,109 changes: 1,109 additions & 0 deletions json_data/FR_3R_5L_1890-01-14_p0005.json

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion json_data/a-verifier/zz-JO-debats-14011890_0007.json

This file was deleted.

1 change: 0 additions & 1 deletion json_data/a-verifier/zz_27111889_jo_debats_0001.json

This file was deleted.

1 change: 0 additions & 1 deletion json_data/a-verifier/zz_27111889_jo_debats_0003.json

This file was deleted.

1 change: 0 additions & 1 deletion json_data/a-verifier/zz_27111889_jo_debats_0005.json

This file was deleted.

1 change: 0 additions & 1 deletion json_data/a-verifier/zz_27111889_jo_debats_0006.json

This file was deleted.

1 change: 0 additions & 1 deletion json_data/a-verifier/zz_27111889_jo_debats_0007.json

This file was deleted.

1 change: 0 additions & 1 deletion json_data/a-verifier/zz_27111889_jo_debats_0008.json

This file was deleted.

1 change: 0 additions & 1 deletion json_data/a-verifier/zz_27111889_jo_debats_0009.json

This file was deleted.

1 change: 0 additions & 1 deletion json_data/a-verifier/zz_27111889_jo_debats_0010.json

This file was deleted.

1 change: 0 additions & 1 deletion json_data/a-verifier/zz_27111889_jo_debats_0011.json

This file was deleted.

1 change: 0 additions & 1 deletion json_data/a-verifier/zz_27111889_jo_debats_0012.json

This file was deleted.

1 change: 0 additions & 1 deletion json_data/a-verifier/zz_27111889_jo_debats_0013.json

This file was deleted.

1 change: 0 additions & 1 deletion json_data/a-verifier/zz_27111889_jo_debats_0016.json

This file was deleted.

1 change: 0 additions & 1 deletion json_data/a-verifier/zz_27111889_jo_debats_0017.json

This file was deleted.

1 change: 0 additions & 1 deletion json_data/a-verifier/zz_27111889_jo_debats_0018.json

This file was deleted.

1 change: 0 additions & 1 deletion json_data/a-verifier/zz_27111889_jo_debats_0019.json

This file was deleted.

This file was deleted.

This file was deleted.

5,197 changes: 0 additions & 5,197 deletions json_data/json-test-remise-ordre/json/zz_27111889_jo_debats_0001_sorted.json

This file was deleted.

This file was deleted.

4,018 changes: 0 additions & 4,018 deletions json_data/json-test-remise-ordre/json/zz_27111889_jo_debats_0002_sorted.json

This file was deleted.

This file was deleted.

4,355 changes: 0 additions & 4,355 deletions json_data/json-test-remise-ordre/json/zz_27111889_jo_debats_0007_sorted.json

This file was deleted.

This file was deleted.

5,761 changes: 0 additions & 5,761 deletions json_data/json-test-remise-ordre/json/zz_27111889_jo_debats_0016_sorted.json

This file was deleted.

This file was deleted.

4,951 changes: 0 additions & 4,951 deletions json_data/json-test-remise-ordre/json/zz_27111889_jo_debats_0017_sorted.json

This file was deleted.

1 change: 1 addition & 0 deletions json_data_raw/zz-JO-debats-14011890_0007.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz-JO-debats-14011890_0008.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz-JO-debats-14011890_0009.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz-JO-debats-14011890_0010.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz-JO-debats-14011890_0011.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0001.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0002.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0003.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0004.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0005.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0006.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0007.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0008.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0009.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0010.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0011.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0012.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0013.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0014.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0016.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0017.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0018.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions json_data_raw/zz_27111889_jo_debats_0019.json

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import numpy as np
import sys

def sort_entries(entries, columns) :
def sort_entries(entries: List, columns: list[(int, int, int, int)], *,
skip_error: bool = True) -> int:

e_coords = np.array([e["box"] for e in entries], dtype="int")
e_center_x = e_coords[:,0] + e_coords[:,2] // 2
Expand All @@ -31,10 +32,10 @@ def sort_entries(entries, columns) :
for i, e in enumerate(entries):
k = int(keys[i])
if not bucket_has[k, i]:
err = "The entry {e} is not included in any column."
err = f"The entry {e} is not included in any column."
e["key"] = (np.inf, np.inf)
# if not skip_error:
# raise RuntimeError(err)
if not skip_error:
raise RuntimeError(err)
else:
e["key"] = (k, int(e_center_y[i]))
entries.sort(key = lambda e: e["key"])
Expand Down
70 changes: 70 additions & 0 deletions scripts remise ordre box/sort_entries2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@

import json
from typing import List
import numpy as np
import sys

def sort_entries(entries: List, columns: List, *,
                 skip_error: bool = True) -> None:
    """Sort ``entries`` in place into reading order (column, then vertical position).

    Each entry and each column is a dict whose ``"box"`` item holds
    ``(x, y, width, height)`` coordinates.  An entry is assigned to the first
    column whose rectangle contains the entry's center point; entries are then
    ordered by (column index, vertical center).

    :param entries: entry dicts to sort; each gains a ``"key"`` item and the
        list is reordered in place.
    :param columns: column dicts delimiting the page layout.
    :param skip_error: when True (default), an entry whose center falls in no
        column is pushed to the end of the list instead of raising.
    :raises RuntimeError: when an entry is in no column and ``skip_error`` is
        False.
    """
    e_coords = np.array([e["box"] for e in entries], dtype="int")
    # Center point of each entry box: (x + w // 2, y + h // 2).
    e_center_x = e_coords[:, 0] + e_coords[:, 2] // 2
    e_center_y = e_coords[:, 1] + e_coords[:, 3] // 2

    buckets = np.array([c["box"] for c in columns], dtype="int")
    # (x, y, w, h) => (x0, y0, x1, y1)
    x0 = buckets[:, 0]
    y0 = buckets[:, 1]
    x1 = buckets[:, 0] + buckets[:, 2]
    y1 = buckets[:, 1] + buckets[:, 3]

    # bucket_has[k, i] is True when column k contains the center of entry i.
    bucket_has = np.logical_and.reduce([
        x0[:, np.newaxis] <= e_center_x[np.newaxis, :],
        x1[:, np.newaxis] > e_center_x[np.newaxis, :],
        y0[:, np.newaxis] <= e_center_y[np.newaxis, :],
        y1[:, np.newaxis] > e_center_y[np.newaxis, :],
    ])

    # Index of the first matching column for each entry.  argmax yields 0 when
    # no column matches, hence the explicit bucket_has check below.
    keys = np.argmax(bucket_has, axis=0)

    for i, e in enumerate(entries):
        k = int(keys[i])
        if not bucket_has[k, i]:
            # Entry is in no column: sort it last, or fail loudly.
            e["key"] = (np.inf, np.inf)
            if not skip_error:
                raise RuntimeError(f"The entry {e} is not included in any column.")
        else:
            e["key"] = (k, int(e_center_y[i]))
    entries.sort(key=lambda e: e["key"])

def process(path_in, path_out):
    """Reorder the entries of one JSON layout file.

    Reads ``path_in``, sorts the boxes of type ENTRY / TITLE_LEVEL_2 into
    reading order using the COLUMN_LEVEL_1 boxes, and writes the full record
    list (unsorted types first, then the reordered entries) to ``path_out``.

    :param path_in: path of the input JSON file.
    :param path_out: path of the output JSON file (overwritten).
    """
    # Explicit encoding: the output uses ensure_ascii=False, so relying on the
    # platform default encoding could corrupt accented text.
    with open(path_in, encoding="utf-8") as f:
        data = json.load(f)

    types_to_sort = ('ENTRY', "TITLE_LEVEL_2")

    entries = [e for e in data if e["type"] in types_to_sort]
    columns = [e for e in data if e["type"] == 'COLUMN_LEVEL_1']
    sort_entries(entries, columns)
    # `remaining` (not `all`) avoids shadowing the builtin.
    remaining = [e for e in data if e["type"] not in types_to_sort]
    remaining.extend(entries)

    with open(path_out, "w", encoding="utf-8") as f:
        json.dump(remaining, f, indent=True, ensure_ascii=False)


if __name__ == "__main__":
    # Command-line entry point: require exactly an input and an output path.
    if len(sys.argv) != 3:
        print("""
Usage: {} input.json output.json
Reorder the entries of a json file.
"""
              .format(sys.argv[0]))
        # sys.exit instead of the site-provided exit(); same exit status.
        sys.exit(1)

    process(sys.argv[1], sys.argv[2])

149 changes: 101 additions & 48 deletions scripts/main.py
Original file line number Diff line number Diff line change
@@ -1,66 +1,119 @@
import json, os
from script_balisage_formel import add_seg, add_signed
from script_balisage_semantique import add_utterance, add_comment, add_incident, add_quote
from script_nettoyage import nettoyage_saut_ligne
import json, os, re
from lxml import etree
from script_compilation import compilation
from script_nettoyage import clean_xml
from script_metadonnees import var_metadata, build_teiheader

# Chemin vers les fichiers JSON
path_to_json = '/home/fanny/Documents/AGODA/Docs de travail AGODA Github/Transformations/json_data/'
# Chemins vers les fichiers JSON et les fichiers XML
path = os.path.dirname(__file__)
path_to_json = os.path.join(os.path.abspath(os.path.join(path, os.pardir)), "json_data")
path_to_xml = os.path.join(os.path.abspath(os.path.join(path, os.pardir)), "xml_data")

beginning_elements = '<?xml version="1.0" encoding="UTF-8"?> <?xml-model ' \
'href="agoda_schema.rng" ' \
'type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?> <?xml-model ' \
'href="agoda_schema.rng" ' \
'type="application/xml" schematypens="http://purl.oclc.org/dsdl/schematron"?> <TEI ' \
'xmlns="http://www.tei-c.org/ns/1.0" xml:lang="fr"> '
end_elements = '</TEI>'

compteur = 1
# Boucle permettant de lire chaque fichier JSON dont le nom finit par ".json" contenu dans un dossier dont le chemin
# est donné

for file_name in sorted([file for file in os.listdir(path_to_json) if file.endswith('.json')]):
with open(os.path.join(path_to_json, file_name), encoding='utf-8') as json_file:
data = json.load(json_file)

# Fonction principale permettant d'appliquer l'ensemble des définitions définies dans les autres scripts
def main(x,compteur):
"""
Expliquer fonction
:return:
"""
# définir zwt
for i in range(len(data)):
if "comment" in data[i]:
if re.search(r"body[^1]", data[i]["comment"]):
inc = 0
zwt = int(data[i]['text_ocr'].split()[-1])
elif re.search(r"page-number", data[i]["comment"]) and not re.search(r"body", data[i]["comment"]):
data[i]['text_ocr'] = "".join(['<pb n="', str(zwt + inc), '"/>'])
inc += 1
elif re.search(r"page-number-ref", data[i]["comment"]):
data[i]['text_ocr'] = "".join(['<ref target="#', str(zwt + inc), '"/>'])
inc += 1

# Application de la fonction main
date_pub, meetings, meeting_sitting, date_sitting = var_metadata(data)
header = build_teiheader(date_pub, meetings, meeting_sitting, date_sitting)

# Appeler les définitions classées par thématiques dans un ordre bien précis
add_quote(x)
add_incident(x)
add_seg(x)
add_comment(x)
add_utterance(x)
add_signed(x)
nettoyage_saut_ligne(x)
compilation(data, zwt, inc)

# Gestion des éléments obligatoires en XML TEI à ajouter (élément racine par exemple) ici ?

# Écriture du contenu de data dans les JSON - étape intermédiaire à enlever par la suite
with open('essai' + str(compteur) + '.json', 'w') as f:
json.dump(x, f)
# Création fichier .xml en mode écriture
for i in range(len(data)):
if "comment" in data[i]:
if "body" in data[i]["comment"]:

return
name = json_file
file_name1 = re.split(r"_p", file_name)[0]
output_xml = open(str(os.path.join(path_to_xml, file_name1 + ".xml")), mode="w")

output_xml.write(beginning_elements)
output_xml.write(header)

# Boucle permettant d'écrire dans le fichier .xml tous les text_ocr de data
if "text_ocr" in data[i]:
if len(data[i]["text_ocr"]) > 0:
output_xml.write(data[i]['text_ocr'])

# ajouter espace entre boxes
output_xml.write(" ")

# Boucle permettant de lire chaque fichier JSON dont le nom finit par ".json" contenu dans un dossier dont le chemin est donné
for file_name in [file for file in os.listdir(path_to_json) if file.endswith('.json')]:
with open(path_to_json + file_name) as json_file:
data = json.load(json_file)
if "comment" in data[i]:
if "text" in data[i]["comment"]:
output_xml.write(end_elements)
output_xml.close()

# Application de la fonction main
main(data, compteur)
compteur +=1
# vérification du schéma TEI --> à gérer

# Création fichier .xml en mode écriture
for i in range(len(data)):
if "comment" in data[i]:
if "body" in data[i]["comment"]:
name = json_file
output_xml = open(str(file_name)+".xml", mode="w")
# Boucle permettant d'écrire dans le fichier .xml tous les text_ocr de data
if "text_ocr" in data[i]:
if len(data[i]["text_ocr"]) > 0:
output_xml.write(data[i]['text_ocr'])

# autre méthode pour la création de fichiers .xml ?
# écrire dans un fichier pour chacune des séances --> à gérer
# nommer chaque fichier d'une certaine façon --> à gérer
# vérification du schéma TEI --> à gérer
# Nettoyage des fichiers xml

output_xml.close()
clean_xml(path_to_xml)

# ---------------------------------------------------------------------------------------------------------------------

# Ajout des xi:include dans le teiCorpus

my_tree = etree.parse(os.path.join(path_to_xml, 'FR_3R_5L.xml'))
my_root = my_tree.getroot()
namespace = 'http://www.w3.org/2001/XInclude'


for file_name in sorted([file for file in os.listdir(path_to_xml) if file.endswith('.xml')]):
if len(str(file_name)) > 12:
f = open(os.path.join(path_to_xml,'FR_3R_5L.xml'),mode = "r")
text = f.read()
if re.search(file_name,text) :
print("ok")
continue

#if not

# Ajout des xi:include en allant chercher les noms des fichiers xml

my_root.append(etree.Element(etree.QName(namespace, 'include'), nsmap={'xi':namespace},
attrib={'href':str(file_name)}))

# Sauvegarde
file_to_save = os.path.join(path_to_xml, 'FR_3R_5L.xml')
my_tree.write(file_to_save, pretty_print=True, encoding='utf-8', xml_declaration=True)

# Vérification du respect du schéma relaxNG

# # Ouverture du fichier rng
# relaxng_doc = etree.parse(path_to_xml + '/agoda_schema.rng')
#
# # Association comme étant un schéma relaxNG
# relaxng = etree.RelaxNG(relaxng_doc)
#
# # Vérification des erreurs sur l'ensemble des fichiers XML
# relaxng.assertValid(my_tree)

# ---------------------------------------------------------------------------------------------------------------------
print(my_root)
29 changes: 19 additions & 10 deletions scripts/script_balisage_formel.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
import re


def add_seg(data):
"""
Ajoute l'élément TEI "seg" pour chacun des paragraphes étiquetés "seg", "seg-beginning", "seg-end"
Ajout de l'élément TEI "seg" pour chaque boxe étiquetée "seg" ou "seg-beginning", "seg-end" à l'exception
des "seg" couplés avec l'étiquette "quote-beginning" ou "quote-end"
:return:
"""
for i in range(len(data)):
if "comment" in data[i]:
if re.search(r"seg[^-]|seg$", data[i]["comment"]) and not re.search(r"quote-beginning|quote-end", data[i]["comment"]): # expression régulière : seg pouvant être suivi de n'importe quel caractère sauf le "-", et seg en fin de ligne
if re.search(r"seg[^-]|seg$", data[i]["comment"]) and not re.search(r"quote-beginning|quote-end", data[i]["comment"]):
#expression régulière : seg pouvant être suivi de n'importe quel caractère sauf le "-", et seg en fin de ligne
#obligation de mettre "and not" car problème pour la gestion des quote-beginning/ quote-end, chevauchement entre seg et quote
#la gestion du seg pour ce cas précis est géré dans la fonction add_quote
data[i]['text_ocr'] = "".join(['<seg>', data[i]['text_ocr'], '</seg>'])
elif re.search(r"seg-beginning", data[i]["comment"]):
data[i]['text_ocr'] = "".join(['<seg>', data[i]['text_ocr']])
Expand All @@ -19,7 +24,7 @@ def add_seg(data):

def add_signed(data):
"""
Ajoute l'élément TEI "signed" pour chaque élément étiqueté "signed"
Ajout de l'élément TEI "signed" pour chaque boxe étiquetée "signed"
:return:
"""
for i in range(len(data)):
Expand All @@ -30,18 +35,22 @@ def add_signed(data):
pass
return data

# NE FONCTIONNE PAS : À REPRENDRE
def add_page_beginning(data):

def add_page_number(data, zwt, inc):
"""
Ajoute l'élément TEI "pb" lorsqu'il y a l'étiquette "page-number" ou "page-number-ref", "seg-end"
Ajout de l'élément TEI "pb" et de l'attribut @n pour chaque boxe étiquetée "page-number" et ajout de l'élément TEI
"ref" et de l'attribut @target pour chaque boxe étiquetée "page-number-ref"
:return:
"""
for i in range(len(data)):
if "comment" in data[i]:
if re.search(r"page-number[^-]|page-number$", data[i]["comment"]): # expression régulière : page-number pouvant être suivi de n'importe quel caractère sauf le "-", et page-number en fin de ligne
data[i]['text_ocr'] = "".join(['<pb n="', data[i]['text_ocr'], '"/>'])
if re.search(r"body[^1]", data[i]["comment"]):
data[i]['text_ocr'] = "".join(['<pb n="', data[i]['text_ocr'].split()[-1], '"/>'])
elif re.search(r"page-number[^-]|page-number$", data[i]["comment"]) and not re.search(r"body", data[i]["comment"]):
data[i]['text_ocr'] = "".join(['<pb n="', str(zwt + inc), '"/>'])
elif re.search(r"page-number-ref", data[i]["comment"]):
data[i]['text_ocr'] = "".join(['<ref target="#', data[i]['text_ocr'], '"/>'])
data[i]['text_ocr'] = "".join(['<ref target="#', str(zwt + inc), '"/>'])
else:
pass
return data
return data

Loading

0 comments on commit 54732dd

Please sign in to comment.