Skip to content

Commit fbf994d

Browse files
committed
updated documentation
1 parent 19da017 commit fbf994d

8 files changed

+100
-124
lines changed

convokit/expected_context_framework/col_normed_tfidf.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,7 @@ def fit(self, corpus, y=None, selector=lambda x: True):
4848

4949
def transform(self, corpus, selector=lambda x: True):
5050
"""
51-
Computes column-normalized tf-idf representations for utterances in a corpus, stored in the corpus as `<output_field>`. Also annotates each utterance with a metadata field,
52-
`<output_field>__n_feats`, indicating the number of terms in the vocabulary that utterance contains.
51+
Computes column-normalized tf-idf representations for utterances in a corpus, stored in the corpus as `<output_field>`. Also annotates each utterance with a metadata field, `<output_field>__n_feats`, indicating the number of terms in the vocabulary that utterance contains.
5352
5453
5554
:param corpus: Corpus
@@ -119,7 +118,7 @@ class ColNormedTfidf(TransformerMixin):
119118

120119
"""
121120
Model that derives tf-idf reweighted representations of utterances,
122-
which are normalized by column. Can be used in ConvoKit through the `ColNormedTfidfWrapper` transformer; see documentation of that transformer for further details.
121+
which are normalized by column. Can be used in ConvoKit through the `ColNormedTfidfTransformer` transformer; see documentation of that transformer for further details.
123122
"""
124123

125124
def __init__(self, **kwargs):

convokit/expected_context_framework/dual_context_wrapper.py

Lines changed: 25 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -7,39 +7,31 @@
77
from convokit.transformer import Transformer
88

99
class DualContextWrapper(Transformer):
10+
"""
11+
Transformer that derives and compares characterizations of terms and utterances with respect to two different choices of conversational context. Designed in particular to contrast replies and predecessors, though other choices of context are also possible.
12+
13+
This is a wrapper that encompasses two instances of `ExpectedContextModelTransformer`, stored at the `ec_models` attribute.
14+
It computes two particular comparative term-level statistics, orientation and shift, stored as the `term_orientations` and `term_shifts` attributes.
15+
It also computes these statistics at the utterance level in the transform step.
16+
17+
:param context_fields: list containing the names of the utterance-level attributes containing the IDs of the context-utterances used by each of the `ExpectedContextModelTransformer` instances.
18+
:param output_prefixes: list containing the names of the attributes and vectors that each `ExpectedContextModelTransformer` instance will write to in the transform step.
19+
:param vect_field: the name of the vectors to use as input vector representation for utterances, as stored in a corpus.
20+
:param context_vect_field: the name of the vectors to use as input vector representations for context-utterances, as stored in a corpus. By default, the transformer will use the same vector representations as utterances, specified in `vect_field`. If you expect that utterances and context-utterances will differ in some way (e.g., they come from speakers in a conversation who play clearly delineated roles), then it's a good idea to use a different input representation.
21+
:param wrapper_output_prefix: the metadata fields where the utterance-level orientation and shift statistics are stored. By default, these attributes are stored as `orn` and `shift` in the metadata; if `wrapper_output_prefix` is specified, then they are stored as `<wrapper_output_prefix>_orn` (orientation) and `<wrapper_output_prefix>_shift` (shift).
22+
:param n_svd_dims: the dimensionality of the representations to derive (via LSA/SVD).
23+
:param snip_first_dim: whether or not to remove the first dimension of the derived representations. By default this is set to `True`, since we've found that the first dimension tends to reflect term frequency, making the output less informative. Note that if `snip_first_dim=True` then in practice, we output `n_svd_dims-1`-dimensional representations.
24+
:param n_clusters: the number of clusters to infer.
25+
:param cluster_on: whether to cluster on utterance or term representations (corresponding to values `'utts'` or `'terms'`). By default, we infer clusters based on representations of the utterances from the training data, and then assign term and context-utterance representations to the resultant clusters. In some cases (e.g., if utterances are highly unstructured and lengthy) it might be better to cluster term representations first.
26+
:param random_state: the random seed to use in the LSA step (which calls a randomized implementation of SVD)
27+
:param cluster_random_state: the random seed to use to infer clusters.
1028
29+
"""
1130
def __init__(self, context_fields, output_prefixes,
1231
vect_field, context_vect_field=None, wrapper_output_prefix='',
1332
n_svd_dims=25, snip_first_dim=True, n_clusters=8, cluster_on='utts',
1433
random_state=None, cluster_random_state=None):
15-
"""
16-
Transformer that derives and compares characterizations of terms and utterances with respect to two different choices of conversational context. Designed in particular to contrast replies and predecessors, though other choices of context are also possible.
17-
18-
This is a wrapper that encompasses two instances of `ExpectedContextModelTransformer`, stored at the `ec_models` attribute.
19-
It computes two particular comparative term-level statistics, orientation and shift, stored as the `term_orientations` and `term_shifts` attributes.
20-
It also computes these statistics at the utterance level in the transform step.
21-
22-
:param context_fields: list containing the names of the utterance-level attributes containing the IDs of the context-utterances used by each of the `ExpectedContextModelTransformer` instances.
23-
:param output_prefixes: list containing the name of the attributes and vectors that each `ExpectedContextModelTransformer` instances will write to in the transform step.
24-
:param vect_field: the name of the vectors to use as input vector representation for utterances, as stored in a corpus.
25-
:param context_vect_field: the name of the vectors to use as input vector representations for context-utterances, as stored in a corpus. by default,
26-
the transformer will use the same vector representations as utterances, specified in `vect_field`. if you expect that utterances
27-
and context-utterances will differ in some way (e.g., they come from speakers in a conversation who play clearly delineated roles),
28-
then it's a good idea to use a different input representation.
29-
:param wrapper_output_prefix: the metadata fields where the utterance-level orientation and shift statistics are stored. By default, these attributes are stored as `orn` and `shift` in the metadata; if `wrapper_output_prefix` is specified, then they are stored as `<wrapper_output_prefix>_orn` (orientation) and `<wrapper_output_prefix>_shift` (shift).
30-
:param n_svd_dims: the dimensionality of the representations to derive (via LSA/SVD).
31-
:param snip_first_dim: whether or not to remove the first dimension of the derived representations. by default this is set to `True`, since we've
32-
found that the first dimension tends to reflect term frequency, making the output less informative. Note that if `snip_first_dim=True`
33-
then in practice, we output `n_svd_dims-1`-dimensional representations.
34-
:param n_clusters: the number of clusters to infer.
35-
:param cluster_on: whether to cluster on utterance or term representations, (corresponding to values `'utts'` or `'terms'`). By default, we infer clusters
36-
based on representations of the utterances from the training data, and then assign term and context-utterance representations to the resultant clusters.
37-
In some cases (e.g., if utterances are highly unstructured and lengthy) it might
38-
be better to cluster term representations first.
39-
:param random_state: the random seed to use in the LSA step (which calls a randomized implementation of SVD)
40-
:param cluster_random_state: the random seed to use to infer clusters.
4134

42-
"""
4335
self.context_fields = context_fields
4436
self.output_prefixes = output_prefixes
4537
self.vect_field = vect_field
@@ -76,10 +68,8 @@ def fit(self, corpus, y=None, selector=lambda x: True, context_selector=lambda x
7668
Fits a transformer over training data: fits the two `ExpectedContextModelTransformer` instances, and computes term-level orientation and shift.
7769
7870
:param corpus: Corpus containing training data
79-
:param selector: a boolean function of signature `filter(utterance)` that determines which utterances
80-
will be considered in the fit step. defaults to using all utterances.
81-
:param context_selector: a boolean function of signature `filter(utterance)` that determines which context-utterances
82-
will be considered in the fit step. defaults to using all utterances.
71+
:param selector: a boolean function of signature `filter(utterance)` that determines which utterances will be considered in the fit step. Defaults to using all utterances.
72+
:param context_selector: a boolean function of signature `filter(utterance)` that determines which context-utterances will be considered in the fit step. Defaults to using all utterances.
8373
:return: None
8474
"""
8575

@@ -94,8 +84,7 @@ def transform(self, corpus, selector=lambda x: True):
9484
Computes vector representations, ranges, and cluster assignments for utterances in a corpus, using the two `ExpectedContextModelTransformer` instances. Also computes utterance-level orientation and shift.
9585
9686
:param corpus: Corpus
97-
:param selector: a boolean function of signature `filter(utterance)` that determines which utterances
98-
to transform. defaults to all utterances.
87+
:param selector: a boolean function of signature `filter(utterance)` that determines which utterances to transform. Defaults to all utterances.
9988
:return: the Corpus, with per-utterance attributes.
10089
"""
10190
self.ec_models[0].transform(corpus, selector=selector)
@@ -169,9 +158,10 @@ def get_term_df(self):
169158
def summarize(self, k=10, max_chars=1000, corpus=None):
170159
"""
171160
For each constituent ExpectedContextModelTransformer, prints inferred clusters and statistics about their sizes.
161+
172162
:param k: number of examples to print out.
173-
:max_chars: maximum number of characters per utterance/context-utterance to print. Can be toggled to control the size of the output.
174-
:corpus: optional, the corpus that the transformer was trained on. if set, will print example utterances and context-utterances as well as terms.
163+
:param max_chars: maximum number of characters per utterance/context-utterance to print. Can be toggled to control the size of the output.
164+
:param corpus: optional, the corpus that the transformer was trained on. If set, will print example utterances and context-utterances as well as terms.
175165
176166
:return: None
177167
"""

0 commit comments

Comments
 (0)