
Commit 45d0fa3: Add files via upload
Parent: 7e8437a
15 files changed: +2880 additions, -0 deletions

README.md

Lines changed: 44 additions & 0 deletions

# RTG4TE

Source code for the paper: Retrieval-enhanced Template Generation for Template Extraction (NLPCC 2024)

## Overview

![model11](figs/an example.png)

An example of template extraction: a generic template is extracted for the document-level REE task, while two event templates, an `Attack` event template and a `Bombing` event template, are extracted for the TF task.

All the required packages are listed in `requirements.txt`. To install all the dependencies, run

```
pip install -r requirements.txt
```

## Data

For the TF task, we downloaded the original dataset from [GTT](https://github.com/xinyadu/gtt). The extracted train, dev, and test files are located in `data/tf/`.
The original data are transformed into our internal format using `convert_tf.py`:

```
python convert_tf.py --input_path data/train.json --output_path data/tf_train.json
```

For the REE task, we downloaded the original dataset from [GRIT](https://github.com/xinyadu/grit_doc_event_entity/). The extracted train, dev, and test files are located in `data/ree/`.
The original data are transformed into our internal format using `convert_grit.py`:

```
python convert_grit.py --input_path data/grit_train.json --output_path data/ree_train.json
```
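
Both converters write a single JSON object mapping each document ID to its text and templates. A sketch of the internal format, abbreviated from the example in the `convert_tf.py` docstring (field values shortened for illustration):

```
{
  "DEV-MUC3-0001": {
    "document": "the arce battalion command has reported ...",
    "annotation": [
      {
        "incident_type": "kidnapping",
        "PerpInd": [["terrorists"]],
        "PerpOrg": [["farabundo marti national liberation front", "fmln"]]
      }
    ]
  }
}
```
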
## Usage

Template filling:

```
python train.py -c config/tf_generative_model.json
```

Role-filler entity extraction:

```
python train.py -c config/ree_generative_model.json
```

## Acknowledgement

We refer to the code of [TempGen](https://github.com/PlusLabNLP/TempGen); thanks for their contributions.

## Citation

config.py

Lines changed: 104 additions & 0 deletions
import copy
import json
import os
from constants import *

from transformers import AutoConfig


class Config(object):
    def __init__(self, **kwargs):
        self.coref = kwargs.pop('coref', False)
        # bert
        self.bert_model_name = kwargs.pop('bert_model_name', 'bert-large-cased')
        self.bert_cache_dir = kwargs.pop('bert_cache_dir', None)
        self.extra_bert = kwargs.pop('extra_bert', -1)
        self.use_extra_bert = kwargs.pop('use_extra_bert', False)
        # model
        self.bert_dropout = kwargs.pop('bert_dropout', .5)
        self.linear_dropout = kwargs.pop('linear_dropout', .4)
        self.linear_bias = kwargs.pop('linear_bias', True)
        self.linear_activation = kwargs.pop('linear_activation', 'relu')

        # decoding
        self.max_position_embeddings = kwargs.pop('max_position_embeddings', 2048)
        self.num_beams = kwargs.pop('num_beams', 4)
        self.decoding_method = kwargs.pop('decoding_method', "greedy")

        # files
        self.train_file = kwargs.pop('train_file', None)
        self.dev_file = kwargs.pop('dev_file', None)
        self.test_file = kwargs.pop('test_file', None)
        self.valid_pattern_path = kwargs.pop('valid_pattern_path', None)
        self.log_path = kwargs.pop('log_path', './log')
        self.output_path = kwargs.pop('output_path', './output')
        self.grit_dev_file = kwargs.pop('grit_dev_file', None)
        self.grit_test_file = kwargs.pop('grit_test_file', None)

        # training
        self.accumulate_step = kwargs.pop('accumulate_step', 1)
        self.batch_size = kwargs.pop('batch_size', 10)
        self.eval_batch_size = kwargs.pop('eval_batch_size', 5)
        self.max_epoch = kwargs.pop('max_epoch', 50)
        self.max_length = kwargs.pop('max_length', 128)
        self.learning_rate = kwargs.pop('learning_rate', 1e-3)
        self.bert_learning_rate = kwargs.pop('bert_learning_rate', 1e-5)
        self.weight_decay = kwargs.pop('weight_decay', 0.001)
        self.bert_weight_decay = kwargs.pop('bert_weight_decay', 0.00001)
        self.warmup_epoch = kwargs.pop('warmup_epoch', 5)
        self.grad_clipping = kwargs.pop('grad_clipping', 5.0)
        self.SOT_weights = kwargs.pop('SOT_weights', 100)
        self.permute_slots = kwargs.pop('permute_slots', False)

        # others
        self.use_gpu = kwargs.pop('use_gpu', True)
        self.gpu_device = kwargs.pop('gpu_device', 0)
        self.seed = kwargs.pop('seed', 0)
        self.use_copy = kwargs.pop('use_copy', False)
        self.use_SAGCopy = kwargs.pop('use_SAGCopy', False)
        self.k = kwargs.pop('k', 12)

    @classmethod
    def from_dict(cls, dict_obj):
        """Creates a Config object from a dictionary.

        Args:
            dict_obj (Dict[str, Any]): a dict whose keys are attribute names
                and whose values are the corresponding attribute values.
        """
        config = cls()
        for k, v in dict_obj.items():
            setattr(config, k, v)
        return config

    @classmethod
    def from_json_file(cls, path):
        with open(path, 'r', encoding='utf-8') as r:
            return cls.from_dict(json.load(r))

    def to_dict(self):
        output = copy.deepcopy(self.__dict__)
        return output

    def save_config(self, path):
        """Save a configuration object to a file.

        :param path (str): path to the output file or its parent directory.
        """
        if os.path.isdir(path):
            path = os.path.join(path, 'config.json')
        print('Save config to {}'.format(path))
        with open(path, 'w', encoding='utf-8') as w:
            w.write(json.dumps(self.to_dict(), indent=2,
                               sort_keys=True))

    @property
    def bert_config(self):
        return AutoConfig.from_pretrained(self.bert_model_name,
                                          cache_dir=self.bert_cache_dir,
                                          max_position_embeddings=self.max_position_embeddings)
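
For orientation, a minimal sketch of how `Config` is meant to be used; the field values below are hypothetical, not the repository's shipped configs:

```
# Hypothetical usage sketch: build a Config from a dict, mirroring what
# Config.from_json_file does with files like config/tf_generative_model.json.
from config import Config

cfg = Config.from_dict({
    'bert_model_name': 'bert-large-cased',  # hypothetical value
    'train_file': 'data/tf_train.json',     # hypothetical path
    'batch_size': 10,
})
print(cfg.batch_size)  # -> 10
cfg.save_config('./')  # writes ./config.json, since the path is a directory
```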

constants.py

Lines changed: 22 additions & 0 deletions
SEP_T = '<SEP_T>'
SEP = '<SEP>'
END_OF_SEP = '</SEP>'
PERP_IND = '<PerpInd>'
END_OF_PERP_IND = '</PerpInd>'
PERP_ORG = '<PerpOrg>'
END_OF_PERP_ORG = '</PerpOrg>'
TARGET = '<Target>'
END_OF_TARGET = '</Target>'
VICTIM = '<Victim>'
END_OF_VICTIM = '</Victim>'
WEAPON = '<Weapon>'
END_OF_WEAPON = '</Weapon>'
AND = "[and]"
NO_ROLE = "[None]"

ROLES = [SEP_T, SEP, END_OF_SEP, PERP_IND, END_OF_PERP_IND, PERP_ORG, END_OF_PERP_ORG,
         TARGET, END_OF_TARGET, VICTIM, END_OF_VICTIM, WEAPON, END_OF_WEAPON, AND, NO_ROLE]

# these variables are for decoding
SLOT_NAME_TAG = 0
ENTITY_TAG = 1

ROLE_FILLER_ENTITY_EXTRACTION = 'ree'
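
These role markers are only useful if the tokenizer treats each one as a single token. A plausible sketch of registering them, assuming a Hugging Face tokenizer is used (the model name is an assumption, not something this file specifies):

```
# Sketch under assumptions: register the role markers as special tokens so
# the generator emits e.g. <PerpInd> as one unit rather than subword pieces.
from transformers import AutoTokenizer
from constants import ROLES

tokenizer = AutoTokenizer.from_pretrained('bert-large-cased')  # assumed model
tokenizer.add_special_tokens({'additional_special_tokens': ROLES})
# the seq2seq model would then need: model.resize_token_embeddings(len(tokenizer))
```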

convert_grit.py

Lines changed: 105 additions & 0 deletions
import argparse
import json
import nltk

# these are for splitting doctext into sentences
nltk.download('punkt')
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def process_entities(entities):
    '''
    Keep only the mention strings, dropping the character offsets:

    [
        [
            ['guerrillas', 37],
            ['guerrilla column', 349]
        ],
        [
            ['apple', 45]
        ],
        [
            ['banana', 60]
        ]
    ]
    -> [['guerrillas', 'guerrilla column'], ['apple'], ['banana']]
    '''
    res = []
    for entity in entities:
        # take only the string of each mention
        res.append([mention[0] for mention in entity])
    return res


def convert(doc, capitalize=False):
    '''
    doc: a dictionary that has the following format:

    {'docid': 'TST1-MUC3-0001',
     'doctext': 'the guatemala army denied today that guerrillas attacked the "santo tomas" presidential farm, located on the pacific side, where president cerezo has been staying since 2 february. a report published by the "cerigua" news agency -- mouthpiece of the guatemalan national revolutionary unity (urng) -- whose main offices are in mexico, says that a guerrilla column attacked the farm 2 days ago. however, armed forces spokesman colonel luis arturo isaacs said that the attack, which resulted in the death of a civilian who was passing by at the time of the skirmish, was not against the farm, and that president cerezo is safe and sound. he added that on 3 february president cerezo met with the diplomatic corps accredited in guatemala. the government also issued a communique describing the rebel report as "false and incorrect," and stressing that the president was never in danger. col isaacs said that the guerrillas attacked the "la eminencia" farm located near the "santo tomas" farm, where they burned the facilities and stole food. a military patrol clashed with a rebel column and inflicted three casualties, which were taken away by the guerrillas who fled to the mountains, isaacs noted. he also reported that guerrillas killed a peasant in the city of flores, in the northern el peten department, and burned a tank truck.',
     'extracts': {'PerpInd': [[['guerrillas', 37], ['guerrilla column', 349]]],
                  'PerpOrg': [[['guatemalan national revolutionary unity', 253],
                               ['urng', 294]]],
                  'Target': [[['"santo tomas" presidential farm', 61],
                              ['presidential farm', 75]],
                             [['farm', 88], ['"la eminencia" farm', 947]],
                             [['facilities', 1026]],
                             [['tank truck', 1341], ['truck', 1346]]],
                  'Victim': [[['cerezo', 139]]],
                  'Weapon': []}}

    capitalize: whether to capitalize doctext or not
    '''
    res = {
        'docid': doc['docid'],
        'document': doc['doctext'],  # the raw text document
        'annotation': []  # a list of templates; in role-filler entity extraction each document has at most one template
    }

    if capitalize:
        # split doctext into sentences and capitalize each one
        sentences = sent_tokenizer.tokenize(doc['doctext'])
        capitalized_doctext = ' '.join([sent.capitalize() for sent in sentences])
        res['document'] = capitalized_doctext

    # TODO: add "tags" in the document

    annotation = doc['extracts']
    for role, entities in annotation.items():
        # skip roles whose entity list is empty
        if entities:
            # make sure res['annotation'] has exactly one dictionary
            if len(res['annotation']) == 0:
                res['annotation'].append({})
            res['annotation'][0][role] = process_entities(entities)

    return res


if __name__ == '__main__':
    p = argparse.ArgumentParser("Convert GRIT input data into our format.")

    p.add_argument('--input_path', type=str, help="input file in GRIT format.")
    p.add_argument('--output_path', type=str, help="path to store the output json file.")
    p.add_argument('--capitalize', action="store_true", help="whether to capitalize the first char of each sentence")
    args = p.parse_args()

    with open(args.input_path, 'r') as f:
        grit_inputs = [json.loads(l) for l in f.readlines()]

    all_processed_doc = dict()

    # iterate through and process all GRIT documents
    for grit_doc in grit_inputs:
        processed = convert(grit_doc, args.capitalize)
        doc_id = processed.pop('docid')
        # keep only documents that have at least one annotated template
        if processed['annotation']:
            all_processed_doc[doc_id] = processed

    with open(args.output_path, 'w') as f:
        f.write(json.dumps(all_processed_doc))
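
A minimal sketch of what `convert` produces, run on a trimmed version of the docstring example (the shortened `doctext` is for illustration only):

```
# Hypothetical usage: convert one trimmed GRIT-style document.
from convert_grit import convert

doc = {
    'docid': 'TST1-MUC3-0001',
    'doctext': 'guerrillas attacked the "santo tomas" presidential farm ...',
    'extracts': {'PerpInd': [[['guerrillas', 37], ['guerrilla column', 349]]],
                 'Weapon': []},
}
res = convert(doc)
# empty roles like 'Weapon' are skipped and character offsets are dropped:
# res['annotation'] == [{'PerpInd': [['guerrillas', 'guerrilla column']]}]
```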

convert_tf.py

Lines changed: 109 additions & 0 deletions
import argparse
import json
import nltk

# these are for splitting doctext into sentences
nltk.download('punkt')
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def process_entities(entities):
    '''
    Keep only the mention strings, dropping the character offsets:

    [
        [
            ['terrorists', 102]
        ]
    ]
    -> [['terrorists']]

    [
        [
            ['farabundo marti national liberation front', 120],
            ['fmln', 163]
        ]
    ]
    -> [['farabundo marti national liberation front', 'fmln']]
    '''
    res = []
    for entity in entities:
        # take only the string of each mention
        res.append([mention[0] for mention in entity])
    return res


def convert(doc, capitalize=False):
    '''
    doc: a dictionary that has the following format:

    {'docid': 'DEV-MUC3-0001',
     'doctext': "the arce battalion command has reported that about 50 peasants of various ages have been kidnapped by terrorists of the farabundo marti national liberation front (fmln) in san miguel department. according to that garrison, the mass kidnapping took place on 30 december in san luis de la reina. the source added that the terrorists forced the individuals, who were taken to an unknown location, out of their residences, presumably to incorporate them against their will into clandestine groups. meanwhile, three subversives were killed and seven others were wounded during clashes yesterday in usulutan and morazan departments. the atonal battalion reported that one extremist was killed and five others were wounded during a clash yesterday afternoon near la esperanza farm, santa elena jurisdiction, usulutan department. it was also reported that a soldier was wounded and taken to the military hospital in this capital. the same military unit reported that there was another clash that resulted in one dead terrorist and the seizure of various kinds of war materiel near san rafael farm in the same town. in the country's eastern region, military detachment no.4 reported that a terrorist was killed and two others were wounded during a clash in la ranera stream, san carlos, morazan department. an m-16 rifle, cartridge clips, and ammunition were seized there. meanwhile, the 3d infantry brigade reported that ponce battalion units found the decomposed body of a subversive in la finca hill, san miguel. an m-16 rifle, five grenades, and material for the production of explosives were found in the same place. the brigade, which is headquartered in san miguel, added that the seizure was made yesterday morning. national guard units guarding the las canas bridge, which is on the northern trunk highway in apopa, this morning repelled a terrorist attack that resulted in no casualties. the armed clash involved mortar and rifle fire and lasted 30 minutes. members of that security group are combing the area to determine the final outcome of the fighting.",
     'templates': [{'incident_type': 'kidnapping',
                    'PerpInd': [[['terrorists', 102]]],
                    'PerpOrg': [[['farabundo marti national liberation front', 120], ['fmln', 163]]],
                    'Target': [], 'Victim': [], 'Weapon': []},
                   {'incident_type': 'attack',
                    'PerpInd': [[['terrorist', 102]]],
                    'PerpOrg': [],
                    'Target': [[['las canas bridge', 1774]]],
                    'Victim': [],
                    'Weapon': [[['rifle', 1322]], [['mortar', 1940]]]}]}

    capitalize: whether to capitalize doctext or not
    '''
    res = {
        'docid': doc['docid'],
        'document': doc['doctext'],  # the raw text document
        'annotation': []  # a list of templates; a TF document may contain several event templates
    }

    if capitalize:
        # split doctext into sentences and capitalize each one
        sentences = sent_tokenizer.tokenize(doc['doctext'])
        capitalized_doctext = ' '.join([sent.capitalize() for sent in sentences])
        res['document'] = capitalized_doctext

    # TODO: add "tags" in the document

    annotation = doc['templates']
    for template in annotation:
        template_dic = {}
        for role, entities in template.items():
            # skip roles whose entity list is empty
            if entities:
                # incident_type is a plain string, not an entity list
                if role == "incident_type":
                    template_dic[role] = entities
                else:
                    template_dic[role] = process_entities(entities)
        if template_dic['incident_type'] in ['kidnapping', 'attack', 'bombing', 'arson', 'robbery']:
            res['annotation'].append(template_dic)
    return res


if __name__ == '__main__':
    p = argparse.ArgumentParser("Convert GTT input data into our format.")

    p.add_argument('--input_path', default="./data/train.json", type=str, help="input file in GTT format.")
    p.add_argument('--output_path', default="./data/tf_train.json", type=str, help="path to store the output json file.")
    p.add_argument('--capitalize', action="store_true", help="whether to capitalize the first char of each sentence")
    args = p.parse_args()

    with open(args.input_path, 'r') as f:
        gtt_inputs = [json.loads(l) for l in f.readlines()]

    all_processed_doc = dict()

    # iterate through and process all GTT documents
    for gtt_doc in gtt_inputs:
        processed = convert(gtt_doc, args.capitalize)
        doc_id = processed.pop('docid')
        all_processed_doc[doc_id] = processed

    with open(args.output_path, 'w') as f:
        f.write(json.dumps(all_processed_doc))
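
A minimal sketch of the incident-type filter in `convert`: templates whose `incident_type` is not one of the five listed types are dropped. The toy document and its 'flood' type are invented for illustration:

```
# Hypothetical usage: the second template is discarded by the type filter.
from convert_tf import convert

doc = {
    'docid': 'TOY-0001',
    'doctext': 'terrorists attacked the bridge.',
    'templates': [
        {'incident_type': 'attack',
         'PerpInd': [[['terrorists', 0]]], 'PerpOrg': [],
         'Target': [[['bridge', 24]]], 'Victim': [], 'Weapon': []},
        {'incident_type': 'flood', 'PerpInd': [], 'PerpOrg': [],
         'Target': [], 'Victim': [], 'Weapon': []},
    ],
}
res = convert(doc)
# res['annotation'] == [{'incident_type': 'attack',
#                        'PerpInd': [['terrorists']],
#                        'Target': [['bridge']]}]
```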
