-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpca_table_of_words.py
109 lines (83 loc) · 3.69 KB
/
pca_table_of_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from typing import List, Dict, Optional
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from src import config
plt.rcParams['font.family'] = 'monospace' # each character in this font is equally wide
def make_principal_comps_table_fig(embeddings: np.ndarray,
words: List[str],
component: int,
num_rows=4,
w2f: Optional[Dict[str, int]] = None,
fontsize: Optional[int] = None,
) -> plt.Figure:
"""
Returns fig showing table of words loading highest and lowest on a principal component
"""
assert np.ndim(embeddings) == 2 # (words, embedding size)
assert len(embeddings) == len(words)
if w2f is None:
w2f = {w: 'n/a' for w in words}
num_cols = 2
col_labels = ['Low End', 'High End']
# do PCA
num_components = component + 1
pca_model = PCA(n_components=num_components)
transformation = pca_model.fit_transform(embeddings)
explained_var_percent = np.asarray(pca_model.explained_variance_ratio_) * 100
pc = transformation[:, component]
# sort and filter
sorted_pc, sorted_words = list(zip(*sorted(zip(pc, words), key=lambda i: i[0])))
col0_strings = [f'{w:<12} {loading:-2.2f} (freq: {w2f[w]})'
for w, loading in zip(sorted_words[:num_rows], sorted_pc[:num_rows])]
col1_strings = [f'{w:<12} {loading:-2.2f} (freq: {w2f[w]})'
for w, loading in zip(sorted_words[-num_rows:][::-1], sorted_pc[-num_rows:][::-1])]
# make matrix containing text
max_rows = max(len(col0_strings), len(col1_strings))
text_mat = np.chararray((max_rows, num_cols), itemsize=60, unicode=True)
text_mat[:] = '' # initialize so that mpl can read table
text_mat[:len(col0_strings), 0] = col0_strings
text_mat[:len(col1_strings), 1] = col1_strings
# fig
res, ax = plt.subplots(figsize=config.Fig.fig_size, dpi=config.Fig.dpi)
ax.set_title('Principal Component {} ({:.2f}% var)'.format(
component, explained_var_percent[component]), fontsize=config.Fig.ax_label_fontsize)
ax.axis('off')
# plot table
table_ = ax.table(cellText=text_mat, colLabels=col_labels, loc='center')
if fontsize is not None:
table_.auto_set_font_size(False)
table_.set_fontsize(fontsize)
res.tight_layout()
return res
def make_loadings_line_fig(embeddings: np.ndarray,
component: int,
) -> plt.Figure:
"""
Returns fig showing line plot of loadings on a specified principal component
"""
# do PCA
num_components = component + 1
pca_model = PCA(n_components=num_components)
transformation = pca_model.fit_transform(embeddings)
pc = transformation[:, component]
# fig
res, ax = plt.subplots(figsize=config.Fig.fig_size, dpi=config.Fig.dpi)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# plot
ax.plot(sorted(pc), '-', linewidth=config.Fig.line_width, color='black')
ax.set_xticklabels([])
ax.set_xlabel('Words', fontsize=config.Fig.ax_label_fontsize)
ax.set_ylabel(f'Principal Component {component} Loading', fontsize=config.Fig.ax_label_fontsize)
res.tight_layout()
return res
NUM_WORDS = 12
EMBED_SIZE = 8
# create random words and random embeddings
words = [f'word-{n}' for n in range(NUM_WORDS)]
embeddings = np.random.random((NUM_WORDS, EMBED_SIZE))
fig = make_principal_comps_table_fig(embeddings, words, component=0, num_rows=6)
fig.show()
fig = make_loadings_line_fig(embeddings, component=0)
fig.show()