-
Notifications
You must be signed in to change notification settings - Fork 158
/
Copy pathgender_by_name.sql
152 lines (146 loc) · 7.95 KB
/
gender_by_name.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
create or replace function depers.gender_by_name(
full_name text, -- ФИО, где фамилия имя и отчество разделяются пробелами и могут следовать в любом порядке
-- или Ф\nИ\nО с переносами строк (порядок следования Ф, И, О важен) улучшит качество разпознавания
is_strict boolean default false -- для неоднозначных ситуаций не учитывает веса и всегда возвращает unknown
) returns depers.gender
immutable
strict -- returns null if any parameter is null
parallel safe -- Postgres 10 or later
language sql
set search_path = ''
as
$func$
with enter_sentence as (
select lower((regexp_matches(t.phrase,
$$
#выделяем слова из текста, отделяем прилипшие друг к другу
[A-Z](?:[a-z]+|\.) #En
| [А-ЯЁ](?:[а-яё]+|\.) #Ru
| [A-Z]+ #EN
| [А-ЯЁ]+ #RU
| [a-z]+ #en
| [а-яё]+ #ru
$$, 'gx'))[1]) as word,
(array['L', 'F', 'M'])[t.position] as type -- L - lastname, F - firstname, M - middlename
from unnest(string_to_array(gender_by_name.full_name, e'\n')) with ordinality t(phrase, position)
where array_length(regexp_split_to_array(gender_by_name.full_name, '\n\s*'), 1) = 3
--where array_length(string_to_array(gender_by_name.full_name, e'\n'), 1) = 3 --TODO speed improvement
)
, enter_sentence2 as (
select distinct on (es.word) es.*
from enter_sentence as es
order by es.word, es.type --дедупликация слов
)
--select * from enter_sentence2; --отладка
, sentence as (
select lower((regexp_matches(t[1], '[a-zа-яё]+', 'ig'))[1]) as word,
(array['L', 'F', 'M'])[row_number() over ()] as type -- L - lastname, F - firstname, M - middlename
from regexp_matches(gender_by_name.full_name,
$$
#выделяем слова из текста, учитываем слова через дефис и в скобках, отделяем прилипшие друг к другу
[A-Z](?:[a-z]+ (?:- [A-Z][a-z]+)*
(?:\s*\(\s*[A-Z][a-z]+\s*\))*
|\.
) #En
| [А-ЯЁ](?:[а-яё]+ (?:- [А-ЯЁ][а-яё]+)*
(?:\s*\(\s*[А-ЯЁ][а-яё]+\s*\))*
|\.
) #Ru
| [A-Z]+ (?:- [A-Z]+)*
(?:\s*\(\s*[A-Z]+\s*\))* #EN
| [А-ЯЁ]+ (?:- [А-ЯЁ]+)*
(?:\s*\(\s*[А-ЯЁ]+\s*\))* #RU
| [a-z]+ (?:- [a-z]+)*
(?:\s*\(\s*[a-z]+\s*\))* #en
| [а-яё]+ (?:- [а-яё]+)*
(?:\s*\(\s*[а-яё]+\s*\))* #ru
$$, 'gx') as t
)
, sentence2 as (
select distinct on (s.word) s.*
from sentence as s
order by s.word, s.type --дедупликация слов
)
--select * from sentence2; --отладка
, found as (
-- проверка имён
select distinct on (s.word)
d.gender, s.word, 'F' as found_type, es.type as enter_type,
-- используем популярность имён, чтобы корректно определялся пол для ФИО типа "величко ольга", "ким александр", "герман анна"
-- по словарю величко - мужское имя, а ольга - женское, но в данном ФИО величко - это фамилия
-- т.к. имя находится по полному совпадению, то вес имени выше, чем у фамилии и отчества
1 + coalesce(d.popularity, 0) as weight
from sentence2 as s
join depers.person_name_dictionary as d
on d.gender is not null -- пропускаем неоднозначные имена типа "саша"
and s.word in (lower(d.name), lower(d.name_translit))
left join enter_sentence2 as es on es.word = s.word
union all
--проверка фамилий
select distinct on (s.word)
d.gender, s.word, 'L' as found_type, es.type as enter_type,
1 as weight
from sentence2 as s
join depers.gender_by_ending as d
on d.gender is not null
and d.name_type = 'last_name'
and length(s.word) > length(d.ending)
and lower(right(s.word, length(d.ending))) in (lower(d.ending), lower(d.ending_translit))
left join enter_sentence2 as es on es.word = s.word
union all
--проверка отчеств
select distinct on (s.word)
d.gender, s.word, 'M' as found_type, es.type as enter_type,
1 as weight
from sentence2 as s
join depers.gender_by_ending as d
on d.gender is not null
and d.name_type = 'middle_name'
and lower(right(s.word, length(d.ending))) in (lower(d.ending), lower(d.ending_translit))
left join enter_sentence2 as es on es.word = s.word
)
--select * from found; -- отладка
, found1 as (
select distinct on (f.gender, f.word) f.* --e'кызы\nэркин\nайпери' (эркин находится в имени и фамилии мужского пола)
from found as f
order by f.gender, f.word, f.weight desc
)
, found2 as (
-- корректировка весов для e'си-ян-пин\nелена\n' и e'саид\nалина\nакбари'
select max(f.gender) as gender,
array_to_string(array_agg(f.word order by f.word), ' ') as word,
max(f.found_type) as found_type,
max(f.enter_type) as enter_type,
sum(f.weight) - count(*) + 1 as weight
from found1 as f
group by f.gender, f.found_type--, enter_type
)
--select * from found2; -- отладка
, stat as (
--пользователи путают Ф,И,О местами и надеяться только на позицию нельзя!
select sum((f.gender = 'male')::int * f.weight)
+ (count(distinct f.word) != count(f.word))::int -- решение об увеличении веса на основе позиции
* sum((f.gender = 'male' and f.found_type = coalesce(f.enter_type, '*'))::int) -- тест: e'холин\nникита\n'
as male_weight,
sum((f.gender = 'female')::int * f.weight)
+ (count(distinct f.word) != count(f.word))::int -- решение об увеличении веса на основе позиции
* sum((f.gender = 'female' and f.found_type = coalesce(f.enter_type, '*'))::int)
as female_weight
from found2 as f
-- игнорируем ФИО разных людей типа 'алексей иванович светлана николаевна' или 'калинина марина сергей иванов'
where not(select count(distinct f.gender) filter (where f.found_type = 'F') = 2
and 2 in (count(distinct f.gender) filter (where f.found_type = 'M'),
count(distinct f.gender) filter (where f.found_type = 'L'))
from found2 as f)
)
--select * from stat; -- отладка
select case when gender_by_name.is_strict and s.male_weight > 0 and s.female_weight > 0 then 'unknown'
--ФИО от нескольких разных людей не должны определяться
when s.male_weight > 0 and s.female_weight > 0
and gender_by_name.full_name ~* '([,/\\;+]|\m(и|или|семья)\M)|[а-я](ины|[оеё]вы|[цс]кие|[внтлр]ые|[кчн]ие)\M' then 'unknown'
when s.male_weight - s.female_weight > 0 then 'male'
when s.male_weight - s.female_weight < 0 then 'female'
else 'unknown'
end::depers.gender as gender
from stat as s;
$func$;