Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add change_names_country_specific method #122

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 120 additions & 37 deletions checklist/perturb.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import json
import pattern
from pattern.en import tenses
from .editor import recursive_apply, MunchWithAdd
from .editor import recursive_apply, MunchWithAdd, Editor

def load_data():
cur_folder = os.path.dirname(__file__)
Expand All @@ -18,6 +18,29 @@ def load_data():
'city': basic['city'],
'country': basic['country'],
}

# enhance data['name'] that contains {'male' : [], 'female' : []} with new keys on the country of origin
editor = Editor()
# create a set of countries that have male/female and last names
list_target_countries = []
for target_country in editor.lexicons.male_from.keys():
if target_country in editor.lexicons.female_from.keys():
if target_country in editor.lexicons.last_from.keys():
list_target_countries.append(target_country)
data['name'][target_country] = {}

for target_country, v in editor.lexicons.male_from.items():
if target_country in list_target_countries:
data['name'][target_country]['male'] = v

for target_country, v in editor.lexicons.female_from.items():
if target_country in list_target_countries:
data['name'][target_country]['female'] = v

for target_country, v in editor.lexicons.last_from.items():
if target_country in list_target_countries:
data['name'][target_country]['last'] = {'male': v, 'female' : v}

return data

def process_ret(ret, ret_m=None, meta=False, n=10):
Expand All @@ -38,7 +61,6 @@ class Perturb:
@staticmethod
def perturb(data, perturb_fn, keep_original=True, nsamples=None, *args, **kwargs):
"""Perturbs data according to some function

Parameters
----------
data : list
Expand All @@ -53,12 +75,10 @@ def perturb(data, perturb_fn, keep_original=True, nsamples=None, *args, **kwargs
number of examples in data to perturb
meta : bool
if True, perturb_fn returns (examples, meta), and meta is added to ret.meta

Returns
-------
MunchWithAdd
will have .data and .meta (if meta=True in **kwargs)

"""
ret = MunchWithAdd()
use_meta = kwargs.get('meta', False)
Expand All @@ -80,6 +100,8 @@ def perturb(data, perturb_fn, keep_original=True, nsamples=None, *args, **kwargs
a = []
x = []
if not p or all([not x for x in p]):
ret_data.append(t)
meta.append(add)
continue
if use_meta:
p, a = p
Expand All @@ -102,17 +124,14 @@ def perturb(data, perturb_fn, keep_original=True, nsamples=None, *args, **kwargs
@staticmethod
def strip_punctuation(doc):
"""Removes punctuation

Parameters
----------
doc : spacy.tokens.Doc
spacy doc

Returns
-------
string
With punctuation stripped

"""
# doc is a spacy doc
while len(doc) and doc[-1].pos_ == 'PUNCT':
Expand All @@ -122,17 +141,14 @@ def strip_punctuation(doc):
@staticmethod
def punctuation(doc):
"""Perturbation function which adds / removes punctuations

Parameters
----------
doc : spacy.tokens.Doc
spacy doc

Returns
-------
list(string)
With punctuation removed and / or final stop added.

"""
# doc is a spacy doc
s = Perturb.strip_punctuation(doc)
Expand All @@ -147,19 +163,16 @@ def punctuation(doc):
@staticmethod
def add_typos(string, typos=1):
"""Perturbation functions, swaps random characters with their neighbors

Parameters
----------
string : str
input string
typos : int
number of typos to add

Returns
-------
list(string)
perturbed strings

"""
string = list(string)
swaps = np.random.choice(len(string) - 1, typos)
Expand All @@ -173,17 +186,14 @@ def add_typos(string, typos=1):
def remove_negation(doc):
"""Removes negation from doc.
This is experimental, may or may not work.

Parameters
----------
doc : spacy.token.Doc
input

Returns
-------
string
With all negations removed

"""
# This removes all negations in the doc. I should maybe add an option to remove just some.
notzs = [i for i, z in enumerate(doc) if z.lemma_ == 'not' or z.dep_ == 'neg']
Expand Down Expand Up @@ -243,17 +253,14 @@ def remove_negation(doc):
def add_negation(doc):
"""Adds negation to doc
This is experimental, may or may not work. It also only works for specific parses.

Parameters
----------
doc : spacy.token.Doc
input

Returns
-------
string
With negations added

"""
for sentence in doc.sents:
if len(sentence) < 3:
Expand Down Expand Up @@ -317,35 +324,29 @@ def add_negation(doc):
@staticmethod
def contractions(sentence, **kwargs):
    """Perturbation function that expands and contracts contractions

    Both the expanded and the contracted form of the sentence are computed;
    only the forms that differ from the input are returned.

    Parameters
    ----------
    sentence : str
        input

    Returns
    -------
    list
        Strings with contractions expanded or contracted, in that order,
        or [] when neither transformation changes the sentence

    """
    # Expansion is evaluated first so the output order matches callers'
    # expectations: [expanded, contracted].
    candidates = (
        Perturb.expand_contractions(sentence),
        Perturb.contract(sentence),
    )
    return [candidate for candidate in candidates if candidate != sentence]

@staticmethod
def expand_contractions(sentence, **kwargs):
"""Expands contractions in a sentence (if any)

Parameters
----------
sentence : str
input string

Returns
-------
string
String with contractions expanded (if any)

"""
contraction_map = {
"ain't": "is not", "aren't": "are not", "can't": "cannot",
Expand Down Expand Up @@ -392,17 +393,14 @@ def expand_match(contraction):
@staticmethod
def contract(sentence, **kwargs):
"""Contract expanded contractions in a sentence (if any)

Parameters
----------
sentence : str
input string

Returns
-------
string
String with contractions contracted (if any)

"""
reverse_contraction_map = {
'is not': "isn't", 'are not': "aren't", 'cannot': "can't",
Expand Down Expand Up @@ -439,7 +437,6 @@ def cont(possible):
@staticmethod
def change_names(doc, meta=False, n=10, first_only=False, last_only=False, seed=None):
"""Replace names with other names

Parameters
----------
doc : spacy.token.Doc
Expand All @@ -454,13 +451,11 @@ def change_names(doc, meta=False, n=10, first_only=False, last_only=False, seed=
if True, will only replace last names
seed : int
random seed

Returns
-------
list(str)
if meta=True, returns (list(str), list(tuple))
Strings with names replaced.

"""
if seed is not None:
np.random.seed(seed)
Expand Down Expand Up @@ -499,10 +494,103 @@ def change_names(doc, meta=False, n=10, first_only=False, last_only=False, seed=
ret_m.append((f, y))
return process_ret(ret, ret_m=ret_m, n=n, meta=meta)

@staticmethod
def change_names_country_specific(target_country, target_gender):
    """Build a name-changing perturbation bound to a country and a gender

    Parameters
    ----------
    target_country : str
        Country whose names are used as replacements. It is looked up as a
        key of Perturb.data['name'] when the returned function is called,
        not when this factory is called.
    target_gender : str
        Gender of the replacement names (e.g. 'male'). It is looked up
        under the target country's entry when the returned function is
        called.

    Returns
    -------
    function
        Function with signature
        (doc, meta=False, n=10, first_only=False, last_only=False, seed=None)
        that forwards to Perturb._change_names_country_specific with
        target_country and target_gender fixed to the values given here.

    """
    # A single leading underscore is used on purpose: a double-underscore
    # name defined inside a class body is subject to private name mangling.
    def _perturb_fn(doc, meta=False, n=10, first_only=False, last_only=False, seed=None):
        return Perturb._change_names_country_specific(
            doc,
            target_country=target_country,
            target_gender=target_gender,
            meta=meta,
            n=n,
            first_only=first_only,
            last_only=last_only,
            seed=seed,
        )

    return _perturb_fn

@staticmethod
def _change_names_country_specific(doc, target_country='France', target_gender='male', meta=False, n=10, first_only=False, last_only=False, seed=None):
    """Replace names with names specific to a target country and gender

    Parameters
    ----------
    doc : spacy.token.Doc
        input
    target_country : str
        Target country to use when changing the name; used as a key of
        Perturb.data['name']
    target_gender : str
        Target gender to use when changing the name; used as a key of the
        target country's entry (and of its 'last' entry)
    meta : bool
        if True, will return list of (orig_name, new_name) as meta
    n : int
        number of names to replace original names with
    first_only : bool
        if True, will only replace first names
    last_only : bool
        if True, will only replace last names
    seed : int
        random seed

    Returns
    -------
    list(str)
        if meta=True, returns (list(str), list(tuple))
        Strings with names replaced.
        None is returned as soon as a single-word entity is met while
        last_only is True.

    """
    if seed is not None:
        np.random.seed(seed)
    # Keep only entities in which every token is tagged as a person.
    ents = [x.text for x in doc.ents if all(a.ent_type_ == 'PERSON' for a in x)]
    ret = []
    ret_m = []
    for ent in ents:
        parts = ent.split()
        old = parts[0]

        # Skip entities whose first token is not a known first name.
        name_set = Perturb.data['name_set']
        first = old.capitalize()
        if not (first in name_set['women'] or first in name_set['men']):
            continue

        if len(parts) > 1:
            # First name followed by a last name: discard the entity when
            # the last name is not in the list of known last names.
            surname = parts[1]
            if len(surname) > 2 and surname.capitalize() not in name_set['last']:
                continue
        elif last_only:
            # A single word may be a first or a last name (Macron or
            # Emmanuel), so there is nothing safe to replace.
            # NOTE(review): this aborts the whole call and drops results
            # already collected for earlier entities; `continue` may be
            # what is intended - confirm before changing.
            return None

        # Draw order (first names, then last names) is kept so that seeded
        # runs stay reproducible.
        names = Perturb.data['name'][target_country][target_gender]
        to_use = np.random.choice(names, n)
        if not first_only:
            old = ent
            # Multi-word entity: replace the full name, or only the last
            # name when last_only is set.
            if len(parts) > 1:
                last = Perturb.data['name'][target_country]['last'][target_gender]
                last = np.random.choice(last, n)
                to_use = ['%s %s' % (given, family) for given, family in zip(to_use, last)]
                if last_only:
                    to_use = last
                    old = parts[1]

        # The pattern depends only on the entity, so build it once.
        pattern = re.compile(r'\b%s\b' % re.escape(old))
        for new in to_use:
            ret.append(pattern.sub(new, doc.text))
            ret_m.append((old, new))

    return process_ret(ret, ret_m=ret_m, n=n, meta=meta)

@staticmethod
def change_location(doc, meta=False, seed=None, n=10):
"""Change city and country names

Parameters
----------
doc : spacy.token.Doc
Expand All @@ -513,13 +601,11 @@ def change_location(doc, meta=False, seed=None, n=10):
random seed
n : int
number of locations to replace original locations with

Returns
-------
list(str)
if meta=True, returns (list(str), list(tuple))
Strings with locations replaced.

"""
if seed is not None:
np.random.seed(seed)
Expand All @@ -543,7 +629,6 @@ def change_location(doc, meta=False, seed=None, n=10):
def change_number(doc, meta=False, seed=None, n=10):
"""Change integers to other integers within 20% of the original integer
Does not change '2' or '4' to avoid abbreviations (this is 4 you, etc)

Parameters
----------
doc : spacy.token.Doc
Expand All @@ -554,13 +639,11 @@ def change_number(doc, meta=False, seed=None, n=10):
random seed
n : int
number of numbers to replace original locations with

Returns
-------
list(str)
if meta=True, returns (list(str), list(tuple))
Strings with numbers replaced.

"""
if seed is not None:
np.random.seed(seed)
Expand Down