diff --git a/docs/build/doctrees/environment.pickle b/docs/build/doctrees/environment.pickle index 4bbbf27c..6af921cf 100644 Binary files a/docs/build/doctrees/environment.pickle and b/docs/build/doctrees/environment.pickle differ diff --git a/docs/build/doctrees/examples.doctree b/docs/build/doctrees/examples.doctree index a7aa4728..f50cf40b 100644 Binary files a/docs/build/doctrees/examples.doctree and b/docs/build/doctrees/examples.doctree differ diff --git a/docs/build/doctrees/feature_builder.doctree b/docs/build/doctrees/feature_builder.doctree index 3435da79..d794f8ae 100644 Binary files a/docs/build/doctrees/feature_builder.doctree and b/docs/build/doctrees/feature_builder.doctree differ diff --git a/docs/build/doctrees/features/index.doctree b/docs/build/doctrees/features/index.doctree index f8e916e8..bd61a581 100644 Binary files a/docs/build/doctrees/features/index.doctree and b/docs/build/doctrees/features/index.doctree differ diff --git a/docs/build/doctrees/utils/calculate_conversation_level_features.doctree b/docs/build/doctrees/utils/calculate_conversation_level_features.doctree index f4b51cf2..3f410982 100644 Binary files a/docs/build/doctrees/utils/calculate_conversation_level_features.doctree and b/docs/build/doctrees/utils/calculate_conversation_level_features.doctree differ diff --git a/docs/build/doctrees/utils/calculate_user_level_features.doctree b/docs/build/doctrees/utils/calculate_user_level_features.doctree index d52241cf..f2586280 100644 Binary files a/docs/build/doctrees/utils/calculate_user_level_features.doctree and b/docs/build/doctrees/utils/calculate_user_level_features.doctree differ diff --git a/docs/build/doctrees/utils/summarize_features.doctree b/docs/build/doctrees/utils/summarize_features.doctree index 5d3249f2..71e2a0c6 100644 Binary files a/docs/build/doctrees/utils/summarize_features.doctree and b/docs/build/doctrees/utils/summarize_features.doctree differ diff --git a/docs/build/html/_sources/examples.rst.txt b/docs/build/html/_sources/examples.rst.txt index 8d39f537..d3c0c97f 100644 --- a/docs/build/html/_sources/examples.rst.txt +++ b/docs/build/html/_sources/examples.rst.txt @@ -1,10 +1,12 @@ .. _examples: +================ Worked Example ================ +------------------- Demo / Sample Code -******************* +------------------- After following the "Getting Started" steps below, the Team Communication Toolkit can be imported at the top of any Python script. We have provided a simple example file, "featurize.py", and a demo notebook, "demo.ipynb," under our `examples folder `_ on GitHub. @@ -17,7 +19,7 @@ We also have demos available on Google Colab that you can copy and run on your o Finally, this page will walk you through a case study, highlighting top use cases and considerations when using the toolkit. Getting Started -**************** +================= To use our tool, please ensure that you have Python >= 3.10 installed and a working version of `pip `_, which is Python's package installer. Then, in your local environment, run the following: @@ -28,7 +30,7 @@ To use our tool, please ensure that you have Python >= 3.10 installed and a work This command will automatically install our package and all required dependencies. 
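As a quick sanity check after installation (a minimal sketch, assuming only the documented import), you can confirm that the package and its dependencies resolve cleanly before moving on:

.. code-block:: python

   # Post-install sanity check: if this import succeeds, the package and its
   # core dependencies were installed correctly. (Illustrative sketch only.)
   from team_comm_tools import FeatureBuilder
   print("team_comm_tools installed and importable")

If the import fails, see the Troubleshooting notes below.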
Troubleshooting
-++++++++++++++++
+-----------------

In the event that some dependency installations fail (for example, you may get an error that ``en_core_web_sm`` from Spacy is not found, or that there is a missing NLTK resource), please run this simple one-line command in your terminal, which will force the installation of Spacy and NLTK dependencies:

@@ -41,14 +43,14 @@ If you encounter a further issue in which the 'wordnet' package from NLTK is not
You can also find a full list of our requirements `here `_.

Import Recommendations: Virtual Environment and Pip
-+++++++++++++++++++++++++++++++++++++++++++++++++++++
+-----------------------------------------------------

**We strongly recommend using a virtual environment in Python to run the package.** We have several specific dependency requirements. One important one is that we are currently only compatible with numpy < 2.0.0 because `numpy 2.0.0 and above `_ made significant changes that are not compatible with other dependencies of our package. As those dependencies are updated, we will support later versions of numpy.

**We also strongly recommend that your version of pip is up-to-date (>=24.0).** There have been reports in which users have had trouble downloading dependencies (specifically, the Spacy package) with older versions of pip. If you get an error with downloading ``en_core_web_sm``, we recommend updating pip.

Importing the Package
-++++++++++++++++++++++
+-----------------------

After you install the package and its dependencies, you can then import our tool and use it in your Python script as follows:

@@ -61,12 +63,12 @@ Now you have access to the :ref:`feature_builder`. This is the main class that y
*Note*: PyPI treats hyphens and underscores equally, so "pip install team_comm_tools" and "pip install team-comm-tools" are equivalent. However, Python does NOT treat them equally, and **you should use underscores when you import the package, like this: from team_comm_tools import FeatureBuilder**.

Walkthrough: Running the FeatureBuilder on Your Data
-*****************************************************
+=======================================================

Next, we'll go through the details of running the FeatureBuilder on your data, discussing each of the specific options / parameters at your disposal.

Configuring the FeatureBuilder
-++++++++++++++++++++++++++++++++
+--------------------------------

The FeatureBuilder accepts any Pandas DataFrame as the input, so you can read in data in whatever format you like. For the purposes of this walkthrough, we'll be using some jury deliberation data from `Hu et al. (2021) `_.

We first import Pandas and read in the dataframe:

import pandas as pd

@@ -95,10 +97,10 @@ Now we are ready to call the FeatureBuilder on our data. All we need to do is de
jury_feature_builder.featurize()

Basic Input Columns
-^^^^^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~~~~~

Conversation Parameters
-"""""""""""""""""""""""""
+**************************

* The **input_df** parameter is where you pass in your dataframe. In this case, we want to run the FeatureBuilder on the juries data that we read in!

* The **speaker_id_col** refers to the name of the column containing a unique identifier for each speaker / participant in the conversation. Here, in the data, the name of our column is "speaker_nickname."

@@ -133,7 +135,7 @@ Conversation Parameters
conversation_id_col = "batch_num"

Vector Directory
-""""""""""""""""""
+*******************

* The **vector_directory** is the name of a directory in which we will store some pre-processed information. Some features require running inference from HuggingFace's `RoBERTa-based sentiment model `_, and others require generating `SBERT vectors `_.
These processes take time, and we cache the outputs so that subsequent runs of the FeatureBuilder on the same dataset will not take as much time. Therefore, we require you to pass in a location where you'd like us to save these outputs.

@@ -148,7 +150,7 @@ Vector Directory
.. _output_file_details:

Output File Naming Details
-""""""""""""""""""""""""""""
+*****************************

* There are three output files for each run of the FeatureBuilder, which mirror the three levels of analysis: utterance-, speaker-, and conversation-level. (Please see the section on `Generating Features: Utterance-, Speaker-, and Conversation-Level `_ for more details.) These are generated using the **output_file_base** parameter.

@@ -188,7 +190,7 @@ Output File Naming Details
Turns
-""""""
+******

* The **turns** parameter controls whether we want to treat successive messages from the same person as a single turn. For example, in a text conversation, sometimes individuals will send many messages in rapid succession, as follows:

@@ -204,13 +206,20 @@ Turns
Advanced Configuration Columns
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
More advanced users of the FeatureBuilder should consider the following optional parameters, depending on their needs.

+Regenerating Vector Cache
+***************************
+
* The **regenerate_vectors** parameter controls whether you'd like the FeatureBuilder to re-generate the content in the **vector_directory**, even if we have already cached the output of a previous run. It is useful if the underlying data has changed, but you want to give the output file the same name as a previous run of the FeatureBuilder.

* By default, **we assume that, if your output file is named the same, the underlying vectors are the same**. If this isn't true, you should set **regenerate_vectors = True** in order to clear out the cache and re-generate the RoBERTa and SBERT outputs.

+Custom Features
+*****************
+
* The **custom_features** parameter allows you to specify features that do not exist within our default set. **We default to NOT generating four features that depend on SBERT vectors, as the process for generating the vectors tends to be slow.** However, these features can provide interesting insights into the extent to which individuals in a conversation speak "similarly" or not, based on a vector similarity metric. To access these features, simply use the **custom_features** parameter:

.. code-block:: python

@@ -224,6 +233,9 @@ More advanced users of the FeatureBuilder should consider the following optional
* You can choose to add any of these features depending on your preference.

+Analyzing First Percentage (%)
+********************************
+
* The **analyze_first_pct** parameter allows you to "cut off" and separately analyze the first X% of a conversation, in case you wish to separately study different sections of a conversation as it progresses. For example, you may be interested in knowing how the attributes of the first 50% of a conversation differ from the attributes of the entire conversation. Then you can specify the following:

.. code-block:: python

@@ -234,8 +246,105 @@ More advanced users of the FeatureBuilder should consider the following optional
* By default, we will simply analyze 100% of each conversation.

+Named Entity Recognition
+**************************
+
* The parameters **ner_training_df** and **ner_cutoff** are required if you would like the FeatureBuilder to identify named entities in your conversations.
For example, the sentence, "John, did you talk to Michael this morning?" has two named entities: "John" and "Michael." The FeatureBuilder includes a tool that automatically detects these named entities, but it requires the user (you!) to specify some training data with examples of the types of named entities you'd like to recognize. This is because proper nouns can take many forms, from standard Western-style names (e.g., "John") to pseudonymous online nicknames (like "littleHorse"). More information about these parameters can be found in :ref:`named_entity_recognition`.

+.. _custom_aggregation:
+
+Custom Aggregation
+********************
+
+Imagine that you, as a researcher, are interested in high-level characteristics of the entire conversation (for example, how much is said), but you only have measures at the (lower) level of each individual utterance (for example, the number of words in each message). How would you "aggregate" information from the lower level to the higher level?
+
+A simple solution is to sum up the number of words in each utterance, grouping by the conversation identifier. Then, you would have the total number of words for the entire conversation. You can imagine doing similar aggregations for other types of statistics --- for example, the average number of words, the variance in the number of words, and so on.
+
+The FeatureBuilder includes built-in functionality to perform aggregations across different levels of analysis. By default, all numeric attributes generated at the utterance (chat) level are aggregated using the functions ``mean``, ``max``, ``min``, and ``stdev``.
+
+We perform three types of aggregations. Consider, for example, a conversation with messages containing 5, 10, and 15 words. Then we would have the following:
+
+- **Conversation-Level Aggregates** transform statistics at the level of an utterance (chat) to the level of a conversation. Examples are the mean number of words per utterance (10) and the maximum number of words in any utterance (15).
+- **Speaker (User)-Level Aggregates** transform statistics at the level of an utterance (chat) to the level of a given speaker (user; participant) in a conversation. An example is the mean number of words per message by a particular speaker.
+- **Conversation-Level Aggregates of Speaker-Level Information** transform information about the speakers (users; participants) to the level of a conversation. An example is the average number of words for the most talkative speaker.
+
+Given that there are multiple default aggregation functions and numerous utterance-level attributes, an (overwhelmingly) large number of aggregation statistics can be produced. As of **v.0.1.5**, aggregation behavior can be customized using the following parameters:
+
+- ``convo_aggregation``: A boolean that defaults to ``True``; when set to ``False``, aggregation at the conversation level is disabled **[NOTE 1]**.
+- ``convo_methods``: A list specifying which aggregation methods to use at the conversation level. Options include ``mean``, ``max``, ``min``, ``stdev``, ``median``, and ``sum`` **[NOTE 2]; [NOTE 3]**. We default to using ``mean``, ``max``, ``min``, and ``stdev``.
+- ``convo_columns``: A list specifying which utterance-level attributes to aggregate to the conversation level. These should be valid columns in the utterance (chat)-level data. This defaults to ``None``, which aggregates all available numeric outputs.
+
+Equivalent parameters for the speaker (user) level are:
+
+- ``user_aggregation``: A boolean that defaults to ``True``; when set to ``False``, aggregation at the speaker (user) level is disabled **[NOTE 1]**.
+- ``user_methods``: A list specifying which aggregation methods to use at the speaker/user level (with the same options as the conversation level).
+- ``user_columns``: A list specifying which utterance-level attributes to aggregate at the speaker/user level.
+
+The table below summarizes the different types of aggregation, and the ways in which they can be customized:
+
+.. list-table:: Aggregation Overview
+   :header-rows: 1
+   :widths: 20 15 20 20 10 15 25
+
+   * - Aggregation Type
+     - Default Methods
+     - Methods Available
+     - Customization Parameters
+     - Output DataFrame
+     - Example Aggregation
+     - Interpretation
+   * - Utterance (Chat) -> Conversation
+     - ``mean``, ``max``, ``min``, ``stdev``
+     - ``mean``, ``max``, ``min``, ``stdev``, ``median``, ``sum``
+     - ``convo_aggregation``, ``convo_methods``, ``convo_columns``
+     - Conversation
+     - ``mean_num_words``
+     - Average number of words per utterance in the conversation
+   * - Utterance (Chat) -> Speaker/User
+     - ``mean``, ``max``, ``min``, ``stdev``
+     - ``mean``, ``max``, ``min``, ``stdev``, ``median``, ``sum``
+     - ``user_aggregation``, ``user_methods``, ``user_columns``
+     - Speaker/User
+     - ``mean_num_words``
+     - Average number of words per utterance for a given individual
+   * - Speaker (User) -> Conversation
+     - ``mean``, ``max``, ``min``, ``stdev``
+     - ``mean``, ``max``, ``min``, ``stdev``, ``median``, ``sum``
+     - ``convo_aggregation``, ``convo_methods``, ``convo_columns``
+     - Conversation
+     - ``max_user_mean_num_words``
+     - Average number of words per utterance for the person who talked the most
+
+
+Example Usage of Custom Aggregation Parameters
++++++++++++++++++++++++++++++++++++++++++++++++
+
+To customize aggregation behavior, simply add parameters like the following when constructing your FeatureBuilder:
+
+.. code-block:: python
+
+   convo_methods = ['max', 'median']  # Aggregate ONLY "positive_bert" at the conversation level, using max and median.
+   convo_columns = ['positive_bert']
+   user_methods = ['mean']            # Aggregate ONLY "negative_bert" at the speaker/user level, using the mean.
+   user_columns = ['negative_bert']
+
+To turn off aggregation, set the following parameters to ``False``. By default, both are ``True``, as aggregation is performed automatically:
+
+.. code-block:: python
+
+   convo_aggregation = False
+   user_aggregation = False
+
+Important Notes and Caveats
+++++++++++++++++++++++++++++
+
+- **[NOTE 1]** Even when aggregation is disabled, totals of words, messages, and characters are still summarized, as these are required for calculating the Gini Coefficient features.
+- **[NOTE 2]** Be careful when choosing the ``sum`` aggregation method, as it is not always an appropriate aggregation function. While it is a sensible choice for utterance-level attributes that are *countable* (for example, the total number of words, or other lexical wordcounts), it is a less sensible choice for others (for example, it does not make sense to sum sentiment scores for each utterance in a conversation). Consequently, using ``sum`` will come with an associated warning.
+- **[NOTE 3]** In addition to aggregating from the utterance (chat) level to the conversation level, we also aggregate from the speaker (user) level to the conversation level, using the same methods specified in ``convo_methods`` to do so.
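Putting these pieces together, a full constructor call might look like the following sketch. The input file and output base name here are hypothetical placeholders; the parameter names themselves match those documented above:

.. code-block:: python

   import pandas as pd
   from team_comm_tools import FeatureBuilder

   my_chats = pd.read_csv("my_chats.csv")  # hypothetical input file

   feature_builder = FeatureBuilder(
       input_df = my_chats,
       vector_directory = "./vector_data/",
       output_file_base = "custom_agg_output",  # hypothetical base name
       convo_methods = ['max', 'median'],       # aggregate "positive_bert" at the conversation level
       convo_columns = ['positive_bert'],
       user_methods = ['mean'],                 # aggregate "negative_bert" at the speaker/user level
       user_columns = ['negative_bert']
   )
   feature_builder.featurize()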
+
+Cumulative Grouping
+*********************
+
* The parameters **cumulative_grouping** and **within_task** address a special case of having multiple conversational identifiers; **they assume that the same team has multiple sequential conversations, and that, in each conversation, they perform one or more separate activities**. This was originally created as a companion to a multi-stage Empirica game (see: ``_). For example, imagine that a team must complete 3 different tasks, each with 3 different subparts. Then we can model this event in terms of 1 team (High level), 3 tasks (Mid level), and 3 subparts per task (Low level).

* In such an activity, we assume that there are three levels of identifiers: High, Mid, and Low.

@@ -297,7 +406,7 @@ More advanced users of the FeatureBuilder should consider the following optional
* Finally, it is important to remember that, since cumulative groupings mean that we progressively consider more and more of the same conversation, **your conversation dataframe will substantially increase in size**, and this may affect the runtime of your FeatureBuilder.

Additional FeatureBuilder Considerations
-++++++++++++++++++++++++++++++++++++++++
+------------------------------------------

Here are some additional design details of the FeatureBuilder that you may wish to keep in mind:

@@ -308,10 +417,11 @@ Here are some additional design details of the FeatureBuilder that you may wish
* **When summarizing features from the utterance level to the conversation and speaker level, we only consider numeric features.** This is perhaps a simplifying assumption more than anything else; although we do extract non-numeric information (for example, a Dale-Chall label of whether an utterance is "Easy" to read or not; a list of named entities identified), we cannot summarize these efficiently, so they are not considered.

Inspecting Generated Features
-++++++++++++++++++++++++++++++
+--------------------------------

Feature Information
-^^^^^^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~~~~~
+
Every FeatureBuilder object has an underlying property called the **feature_dict**, which lists information and references about the features included in the toolkit. Assuming that **jury_feature_builder** is the name of your FeatureBuilder, you can access the feature dictionary as follows:

.. code-block:: python

@@ -350,7 +460,7 @@ Here is some example output (for the RoBERTa sentiment feature):
'bert_sentiment_data': True}

Feature Column Names
-^^^^^^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~~~~~~~

Once you call **.featurize()**, you can also obtain a convenient list of the feature columns generated by the toolkit:

diff --git a/docs/build/html/_sources/features/index.rst.txt b/docs/build/html/_sources/features/index.rst.txt
index 1f67bc29..be1e826c 100644
--- a/docs/build/html/_sources/features/index.rst.txt
+++ b/docs/build/html/_sources/features/index.rst.txt
@@ -32,7 +32,11 @@ Utterance-Level features are calculated *first* in the Toolkit, as many conversa
Conversation-Level Features
****************************
-Once utterance-level features are computed, we compute conversation-level features; some of these features represent an aggregation of utterance-level information (for example, the "average level of positivity" in a conversation is simply the mean positivity score for each utterance). Other conversation-level features are constructs that are defined only at the conversation-level, such as the level of "burstiness" in a team's communication patterns.
+
+Base Conversation-Level Features
++++++++++++++++++++++++++++++++++++
+
+The following features are constructs that are defined only at the conversation-level, such as the level of "burstiness" in a team's communication patterns. We call these the "base" conversation-level features, and they can be accessed using a property of the ``FeatureBuilder`` object: ``FeatureBuilder.conv_features_base``.

.. toctree::
   :maxdepth: 1

@@ -46,12 +50,17 @@ Once utterance-level features are computed, we compute conversation-level featur
   within_person_discursive_range
   turn_taking_features

+Conversation-Level Aggregates
++++++++++++++++++++++++++++++++++++
+Once utterance-level features are computed, we compute conversation-level features; some of these features represent an aggregation of utterance-level information (for example, the "average level of positivity" in a conversation is simply the mean positivity score for each utterance).
+
+By default, all numeric attributes generated at the utterance (chat) level are aggregated using the functions ``mean``, ``max``, ``min``, and ``stdev``. However, this behavior can be customized, with details in the Worked Example (see :ref:`custom_aggregation`).
+
Speaker- (User) Level Features
*********************************

User-level features generally represent an aggregation of features at the utterance level (for example, the average number of words spoken *by a particular user*). There is therefore limited speaker-level feature documentation, other than a function used to compute the "network" of other speakers that an individual interacts with in a conversation.

-You may reference the :ref:`Speaker (User)-Level Features Page ` for more information.
-
+You may reference the :ref:`Speaker (User)-Level Features Page ` for more information, as well as the details in the Worked Example (see :ref:`custom_aggregation`).

.. toctree::
   :maxdepth: 1

diff --git a/docs/build/html/examples.html b/docs/build/html/examples.html
index f7ce3f08..0efbca9c 100644
--- a/docs/build/html/examples.html
+++ b/docs/build/html/examples.html
@@ -49,23 +49,17 @@
[Rendered-HTML diff omitted: docs/build/html/examples.html is regenerated Sphinx output whose text mirrors the examples.rst.txt changes above (the retitled headings; the new Regenerating Vector Cache, Custom Features, Analyzing First Percentage (%), Named Entity Recognition, Custom Aggregation, and Cumulative Grouping subsections; the Aggregation Overview table; and updated navigation links). The unchanged context it re-renders includes the ``download_resources`` troubleshooting command, the ``analyze_first_pct: [0.5, 1.0]`` example, and the ``jury_feature_builder.feature_dict``, ``jury_feature_builder.chat_features``, and ``jury_feature_builder.conv_features_base`` usage lines.]
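To make the feature-inspection steps above concrete, here is a short sketch of how the documented properties might be used after featurization. It assumes the ``jury_feature_builder`` from the walkthrough and relies only on the properties named in the docs (``feature_dict``, ``chat_features``, ``conv_features_base``):

.. code-block:: python

   # Assumes jury_feature_builder.featurize() has already completed ("All Done!").
   info = jury_feature_builder.feature_dict                   # information and references for each feature
   chat_cols = jury_feature_builder.chat_features             # feature columns at the chat (utterance) level
   base_conv_cols = jury_feature_builder.conv_features_base   # base (non-aggregated) conversation-level columns
   print(len(chat_cols), len(base_conv_cols))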
          diff --git a/docs/build/html/feature_builder.html b/docs/build/html/feature_builder.html index bffe9610..62bd0419 100644 --- a/docs/build/html/feature_builder.html +++ b/docs/build/html/feature_builder.html @@ -97,46 +97,75 @@

          feature_builder module

          -class feature_builder.FeatureBuilder(input_df: DataFrame, vector_directory: str = './vector_data/', output_file_base: str = 'output', output_file_path_chat_level: str = None, output_file_path_user_level: str = None, output_file_path_conv_level: str = None, custom_features: list = [], analyze_first_pct: list = [1.0], turns: bool = False, conversation_id_col: str = 'conversation_num', speaker_id_col: str = 'speaker_nickname', message_col: str = 'message', timestamp_col: str | tuple[str, str] = 'timestamp', timestamp_unit='ms', grouping_keys: list = [], cumulative_grouping=False, within_task=False, ner_training_df: DataFrame = None, ner_cutoff: int = 0.9, regenerate_vectors: bool = False, compute_vectors_from_preprocessed: bool = False)
          +class feature_builder.FeatureBuilder(input_df: DataFrame, vector_directory: str = './vector_data/', output_file_base: str = 'output', output_file_path_chat_level: str = None, output_file_path_user_level: str = None, output_file_path_conv_level: str = None, custom_features: list = [], analyze_first_pct: list = [1.0], turns: bool = False, conversation_id_col: str = 'conversation_num', speaker_id_col: str = 'speaker_nickname', message_col: str = 'message', timestamp_col: str | tuple[str, str] = 'timestamp', timestamp_unit='ms', grouping_keys: list = [], cumulative_grouping=False, within_task=False, ner_training_df: DataFrame = None, ner_cutoff: int = 0.9, regenerate_vectors: bool = False, compute_vectors_from_preprocessed: bool = False, convo_aggregation=True, convo_methods: list = ['mean', 'max', 'min', 'stdev'], convo_columns: list = None, user_aggregation=True, user_methods: list = ['mean', 'max', 'min', 'stdev'], user_columns: list = None)

          Bases: object

          The FeatureBuilder is the main engine that reads in the user’s inputs and specifications and generates -conversational features. The FeatureBuilder separately calls the classes (the ChatLevelFeaturesCalculator, -ConversationLevelFeaturesCalculator, and UserLevelFeaturesCalculator) to generate conversational features at -different levels.

          +conversational features. The FeatureBuilder separately calls the classes +(ChatLevelFeaturesCalculator, ConversationLevelFeaturesCalculator, and +UserLevelFeaturesCalculator) to generate conversational features at different levels.

          Parameters:
          • input_df (pd.DataFrame) – A pandas DataFrame containing the conversation data that you wish to featurize.

          • -
          • vector_directory (str) – Directory path where the vectors are to be cached. Defaults to “./vector_data/”

          • -
          • output_file_base (str) – Base name for the output files, which will be used to auto-generate filenames for each of the three levels. Defaults to “output.”

          • -
          • output_file_path_chat_level (str) – Path where the chat (utterance)-level output csv file is to be generated. (This parameter will override the base name.)

          • -
          • output_file_path_user_level (str) – Path where the user (speaker)-level output csv file is to be generated. (This parameter will override the base name.)

          • -
          • output_file_path_conv_level (str) – Path where the conversation-level output csv file is to be generated. (This parameter will override the base name.)

          • -
          • custom_features (list, optional) – A list of additional features outside of the default features that should be calculated. -Defaults to an empty list (i.e., no additional features beyond the defaults will be computed).

          • -
          • analyze_first_pct (list(float), optional) – Analyze the first X% of the data. This parameter is useful because the earlier stages of the conversation may be more predictive than the later stages. Thus, researchers may wish to analyze only the first X% of the conversation data and compare the performance with using the full dataset. Defaults to [1.0].

          • -
          • turns (bool, optional) – If true, collapses multiple “chats”/messages by the same speaker in a row into a single “turn.” Defaults to False.

          • -
          • conversation_id_col (str, optional) – A string representing the column name that should be selected as the conversation ID. Defaults to “conversation_num”.

          • -
          • speaker_id_col (str, optional) – A string representing the column name that should be selected as the speaker ID. Defaults to “speaker_nickname”.

          • -
          • message_col (str, optional) – A string representing the column name that should be selected as the message. Defaults to “message”.

          • -
          • timestamp_col (str, optional) – A string representing the column name that should be selected as the message. Defaults to “timestamp”.

          • -
          • timestamp_unit (str, optional) – A string representing the unit of the timestamp (if the timestamp is numeric). Default to the unit ‘ms’ (milliseconds). Other options (D,s,ms,us,ns) can be found on the Pandas reference: https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html

          • -
          • grouping_keys (list, optional) – A list of multiple identifiers that collectively identify a conversation. If non-empty, we will group by all of the keys in the list and use the grouped key as the unique “conversational identifier.” -Defaults to an empty list.

          • -
          • cumulative_grouping (bool, optional) – If true, uses a cumulative way of grouping chats (not just looking within a single ID, but also at what happened before.) -NOTE: This parameter and the following one (within_grouping) was created in the context of a multi-stage Empirica game (see: https://github.com/Watts-Lab/multi-task-empirica). -It assumes that there are exactly 3 nested columns at different levels: a High, Mid, and Low level; further, it assumes that these levels are temporally nested: that is, each -group/conversation has one High-level identifier, which contains one or more Mid-level identifiers, which contains one or more Low-level identifiers. -Defaults to False.

          • -
          • within_task (bool, optional) – If true, groups cumulatively in such a way that we only look at prior chats that are of the same “task” (Mid-level identifier). Defaults to False.

          • -
          • ner_training_df (pd.DataFrame) – This is a pandas dataframe of training data for named entity recognition feature. Defaults to None, and will not generate named entity featuers if it does not exist.

          • -
          • ner_cutoff (int) – This is the cutoff value for the confidence of prediction for each named entity. Defaults to 0.9.

          • -
          • regenerate_vectors (bool, optional) – If true, will regenerate vector data even if it already exists. Defaults to False.

          • -
          • compute_vectors_from_preprocessed (bool, optional) – If true, computes vectors using preprocessed text (that is, with capitalization and punctuation removed). This was the default behavior for v.0.1.3 and earlier, but we now default to computing metrics on the unpreprocessed text (which INCLUDES capitalization and punctuation). Defaults to False.

          • +
          • vector_directory (str) – Directory path where the vectors are to be cached. Defaults to “./vector_data/”.

          • +
          • output_file_base (str) – Base name for the output files, used to auto-generate filenames for each +of the three levels. Defaults to “output.”

          • +
          • output_file_path_chat_level (str) – Path where the chat (utterance)-level output csv file is +to be generated. This parameter will override the base name.

          • +
          • output_file_path_user_level (str) – Path where the user (speaker)-level output csv file is +to be generated. This parameter will override the base name.

          • +
          • output_file_path_conv_level (str) – Path where the conversation-level output csv file is to be +generated. This parameter will override the base name.

          • +
          • custom_features (list, optional) – A list of additional features outside of the default features that should +be calculated. Defaults to an empty list (i.e., no additional features beyond the defaults will +be computed).

          • +
          • analyze_first_pct (list(float), optional) – Analyze the first X% of the data. This parameter is useful because the +earlier stages of the conversation may be more predictive than the later stages. Defaults to [1.0].

          • +
          • turns (bool, optional) – If true, collapses multiple “chats”/messages by the same speaker in a row into a +single “turn.” Defaults to False.

          • +
          • conversation_id_col (str, optional) – A string representing the column name that should be selected as +the conversation ID. Defaults to “conversation_num”.

          • +
          • speaker_id_col (str, optional) – A string representing the column name that should be selected as the speaker ID. +Defaults to “speaker_nickname”.

          • +
          • message_col (str, optional) – A string representing the column name that should be selected as the message. +Defaults to “message”.

          • +
          • timestamp_col (str, optional) – A string representing the column name that should be selected as the message. +Defaults to “timestamp”.

          • +
          • timestamp_unit (str, optional) – A string representing the unit of the timestamp (if the timestamp is numeric). +Defaults to ‘ms’ (milliseconds). Other options (D, s, ms, us, ns) can be found on the Pandas +reference: https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html

          • +
          • grouping_keys (list, optional) – A list of multiple identifiers that collectively identify a conversation. If +non-empty, the data will be grouped by all keys in the list and use the grouped key as the unique +“conversational identifier.”

          • +
          • cumulative_grouping (bool, optional) – If true, uses a cumulative way of grouping chats (looking not just within +a single ID, but also at what happened before). NOTE: This parameter and the following one +(within_grouping) were created in the context of a multi-stage Empirica game (see: +https://github.com/Watts-Lab/multi-task-empirica). Assumes exactly 3 nested columns at different +levels: a High, Mid, and Low level; that are temporally nested. Defaults to False.

          • +
          • within_task (bool, optional) – If true, groups cumulatively such that only prior chats of the same “task” +(Mid-level identifier) are considered. Defaults to False.

          • +
          • ner_training_df (pd.DataFrame, optional) – A pandas DataFrame of training data for named entity recognition features. +Defaults to None and will not generate named entity features if it does not exist.

          • +
          • ner_cutoff (int) – The cutoff value for the confidence of prediction for each named entity. +Defaults to 0.9.

          • +
          • regenerate_vectors (bool, optional) – If true, regenerates vector data even if it already exists. Defaults to False.

          • +
          • compute_vectors_from_preprocessed (bool, optional) – If true, computes vectors using preprocessed text (with +capitalization and punctuation removed). Defaults to False.

          • +
          • convo_aggregation (bool, optional) – If true, aggregates features at the conversational level. Defaults to True.

          • +
          • convo_methods (list, optional) – Specifies which aggregation functions (e.g., mean, stdev) to use at the +conversational level. Defaults to [‘mean’, ‘max’, ‘min’, ‘stdev’].

          • +
          • convo_columns (list, optional) – Specifies which columns (at the utterance/chat level) to aggregate for the +conversational level. Defaults to all numeric columns.

          • +
          • user_aggregation (bool, optional) – If true, aggregates features at the speaker/user level. Defaults to True.

          • +
          • user_methods (list, optional) – Specifies which functions to aggregate with (e.g., mean, stdev) at the user level. +Defaults to [‘mean’, ‘max’, ‘min’, ‘stdev’].

          • +
          • user_columns (list, optional) – Specifies which columns (at the utterance/chat level) to aggregate for the +speaker/user level. Defaults to all numeric columns.

          Returns:
          -

          The FeatureBuilder doesn’t return anything; instead, it writes the generated features to files in the specified paths. It will also print out its progress, so you should see “All Done!” in the terminal, which will indicate that the features have been generated.

          +

          The FeatureBuilder writes the generated features to files in the specified paths. The progress +will be printed in the terminal, indicating completion with “All Done!”.

          Return type:

          None

          diff --git a/docs/build/html/features/burstiness.html b/docs/build/html/features/burstiness.html index 61daadd7..621048c4 100644 --- a/docs/build/html/features/burstiness.html +++ b/docs/build/html/features/burstiness.html @@ -52,18 +52,18 @@
        • Features: Technical Documentation
        • Speaker- (User) Level Features

          User-level features generally represent an aggregation of features at the utterance- level (for example, the average number of words spoken by a particular user). There is therefore limited speaker-level feature documentation, other than a function used to compute the “network” of other speakers that an individual interacts with in a conversation.

          -

          You may reference the Speaker (User)-Level Features Page for more information.

          +

          You may reference the Speaker (User)-Level Features Page for more information, as well as the details in the Worked Example (see Custom Aggregation).

          • get_user_network module
          • diff --git a/docs/build/html/features/information_diversity.html b/docs/build/html/features/information_diversity.html index df932cd1..f6a9d846 100644 --- a/docs/build/html/features/information_diversity.html +++ b/docs/build/html/features/information_diversity.html @@ -52,20 +52,18 @@
          • Features: Technical Documentation
            • Utterance- (Chat) Level Features
            • Conversation-Level Features
                -
              • burstiness module
              • -
              • information_diversity module
              • Speaker- (User) Level Features
              • diff --git a/docs/build/html/features/turn_taking_features.html b/docs/build/html/features/turn_taking_features.html index bd798a8c..05f45fc1 100644 --- a/docs/build/html/features/turn_taking_features.html +++ b/docs/build/html/features/turn_taking_features.html @@ -52,19 +52,18 @@
              • Features: Technical Documentation
                • Utterance- (Chat) Level Features
                • Conversation-Level Features
                    -
                  • burstiness module
                  • -
                  • information_diversity module
                  • -
                  • gini_coefficient module
                  • -
                  • get_all_DD_features module
                  • -
                  • discursive_diversity module
                  • -
                  • variance_in_DD module
                  • -
                  • within_person_discursive_range module
                  • -
                  • turn_taking_features module
                  • Speaker- (User) Level Features
                  • diff --git a/docs/build/html/features/variance_in_DD.html b/docs/build/html/features/variance_in_DD.html index 09925a44..e4c14026 100644 --- a/docs/build/html/features/variance_in_DD.html +++ b/docs/build/html/features/variance_in_DD.html @@ -52,17 +52,18 @@
                  • Features: Technical Documentation
                    • Utterance- (Chat) Level Features
                    • Conversation-Level Features
                        -
                      • burstiness module
                      • -
                      • information_diversity module
                      • -
                      • gini_coefficient module
                      • -
                      • get_all_DD_features module
                      • -
                      • discursive_diversity module
                      • -
                      • variance_in_DD module
                      • Speaker- (User) Level Features
                      diff --git a/docs/build/html/features/within_person_discursive_range.html b/docs/build/html/features/within_person_discursive_range.html index c9197662..b4f48d8c 100644 --- a/docs/build/html/features/within_person_discursive_range.html +++ b/docs/build/html/features/within_person_discursive_range.html @@ -52,18 +52,18 @@
                      • Features: Technical Documentation
                        • Utterance- (Chat) Level Features
                        • Conversation-Level Features
                            -
                            +
                        • get_user_stdev_dataframe() (in module utils.summarize_features)
                            +
                        • get_user_sum_dataframe() (in module utils.summarize_features)
                        • get_variance_in_DD() (in module features.variance_in_DD)
                        diff --git a/docs/build/html/index.html b/docs/build/html/index.html index de04f500..bc64b98c 100644 --- a/docs/build/html/index.html +++ b/docs/build/html/index.html @@ -215,8 +215,6 @@

                          Table of Contents
                        • Worked Example

                        • Features: Technical Documentation
                            diff --git a/docs/build/html/objects.inv b/docs/build/html/objects.inv index afee53e1..ef0eaeb4 100644 Binary files a/docs/build/html/objects.inv and b/docs/build/html/objects.inv differ diff --git a/docs/build/html/searchindex.js b/docs/build/html/searchindex.js index 2410d5e8..74af2673 100644 --- a/docs/build/html/searchindex.js +++ b/docs/build/html/searchindex.js @@ -1 +1 @@
                            [minified searchindex.js content (regenerated Sphinx search index) omitted]
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 58, 59, 60, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73], "Then": [1, 55, 61], "There": [1, 11, 32, 61, 66], "These": [1, 11, 17, 32, 34, 42, 48, 52, 61, 62, 69], "To": [0, 1, 29, 31, 34, 37, 40, 55, 56, 57, 61, 62], "WITH": 21, "Will": 50, "_deviat": 55, "_preprocessed_": 0, "abil": [13, 29], "abl": [31, 36, 61], "abort": 1, "about": [1, 12, 29, 31, 36, 41, 47, 61, 62], "abov": [1, 21, 34, 61], "abstract_id": 4, "accept": [0, 1, 58, 61], "access": [0, 1, 15, 61], "accommod": [28, 32, 39, 45, 46, 64, 65, 66], "accord": [21, 37, 59, 64, 70], "accordingli": 63, "account": [1, 29, 32, 42], "accus": 50, "achiev": [50, 62], "acknowledg": 49, "acm": [21, 24], "acommod": 36, "across": [1, 13, 28, 31, 34, 40, 41, 50, 62, 64, 73], "action": 59, "activ": [1, 9, 44, 55, 71], "actual": [41, 56], "ad": [61, 62, 71], "adapt": 59, "add": [0, 1, 2, 21, 51, 61], "addit": [2, 32, 34, 42, 63, 69], "addition": [0, 30, 31, 32, 54], "address": 1, "adjac": 71, "adjust": [0, 21, 37, 63], "advanc": [31, 36], "advantag": 4, "adverb": [19, 31, 36], "adverb_limit": [19, 49], "affect": [0, 1, 29, 35, 44], "affirm": 49, "after": [0, 1, 31, 34, 36, 43, 61, 62, 64], "again": [32, 34], "against": [28, 31, 36, 52], "agarw": 62, "aggreg": [0, 1, 3, 11, 37, 44, 61, 62, 65, 66, 72], "agre": 47, "agreement": 49, "ah": [31, 36], "ai": 62, "aim": [39, 62], "airtim": [37, 62], "al": [1, 5, 16, 18, 21, 24, 30, 31, 32, 33, 34, 35, 36, 38, 42, 43, 44, 49, 50, 52, 53, 54, 56, 57, 58, 59, 60, 64], "algorithm": [56, 57], "align": [35, 51], "all": [0, 1, 2, 6, 12, 13, 15, 19, 22, 28, 30, 31, 34, 35, 36, 37, 40, 41, 42, 46, 48, 49, 51, 52, 55, 58, 61, 62, 64, 66, 71, 73], "allow": 1, "almaatouq": 59, "along": 1, "alongsid": 1, "alphabet": 49, "alphanumer": 71, "alreadi": [0, 1, 2, 4, 10, 12, 16, 67], "also": [0, 1, 2, 28, 30, 31, 32, 34, 36, 37, 38, 42, 47, 51, 54, 60, 61, 62, 64, 65, 67, 69, 71], "alsobai": 59, "altern": 59, "although": [1, 23, 31, 36], "alwai": [1, 55], "am": [31, 36, 42, 54, 62], "amaz": [48, 56], "ambient": 32, "american": 33, "ami": [47, 59, 62], "amic": 62, "among": [36, 37, 52, 55, 62], "amongst": [6, 35, 48], "an": [0, 1, 2, 5, 8, 11, 12, 13, 21, 29, 30, 31, 32, 33, 34, 36, 38, 40, 41, 42, 45, 47, 48, 50, 51, 52, 54, 59, 60, 61, 62, 63, 65, 66, 67, 68], "analys": [1, 62], "analysi": [0, 1, 11, 52, 62, 67, 71], "analyt": 62, "analyz": [0, 1, 2, 13, 14, 16, 17, 19, 20, 21, 22, 24, 28, 43, 52, 62, 67, 71], "analyze_first_pct": [0, 1, 2], "angri": 47, "ani": [0, 1, 29, 31, 33, 38, 54, 62, 71], "annot": [17, 50], "anoth": [30, 34, 36, 48], "answer": 29, "anybodi": [31, 36], "anyth": [1, 2, 23, 31, 36, 56], "anywher": [31, 36], "apartment": 42, "api": [2, 47], "api_refer": 24, "apolog": [17, 50], "apologi": 49, "appear": [0, 15, 28, 31, 37, 38, 42, 64], "append": [1, 17, 64, 65, 66, 67], "appli": [4, 13, 18, 62, 64, 69], "applic": [29, 71], "appreci": 50, "approach": [32, 38, 42, 45, 46, 49, 53, 64], "appropri": [31, 69], "ar": [0, 1, 2, 3, 5, 9, 10, 11, 15, 17, 19, 21, 23, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 51, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 69, 71], "arcross": 34, "area": 62, "aren": [31, 36], "around": 2, "arous": 48, "arrai": [6, 8, 67, 68], "articl": [37, 50], "ask": [20, 47, 54], "ask_ag": 49, "aspect": [50, 62], "assert_key_columns_pres": 71, "assign": [1, 31, 36, 38, 45, 46, 52, 59, 61, 63, 71], 
"assign_chunk_num": 69, "associ": [4, 15, 21, 29, 30, 31, 32, 36, 40, 45, 46, 47, 48, 61], "assum": [0, 1, 2, 10, 12, 16, 23, 31, 41, 60, 61, 71], "assumpt": [1, 41, 61], "asterisk": 22, "attribut": [1, 11, 34, 51, 52, 56, 62], "author": [5, 31, 36, 59], "auto": 2, "automat": [0, 1, 61, 69], "auxiliari": [31, 36], "avail": [0, 1, 61, 62, 63, 64, 67], "averag": [11, 13, 28, 30, 33, 34, 35, 40, 41, 46, 52, 64, 65, 66, 72], "avil": 62, "avoid": 30, "awar": 29, "awesom": 62, "b": [4, 34, 35, 45, 46, 55, 62], "back": 62, "bag": [32, 38, 42, 45, 46, 49, 53, 56, 57], "bare_command": [19, 49], "base": [0, 1, 2, 15, 18, 19, 31, 32, 34, 35, 36, 37, 40, 42, 51, 52, 53, 54, 55, 56, 57, 61, 62, 63, 64, 65, 66, 71], "basic": [10, 11, 12, 16, 61, 62], "basic_featur": 11, "batch": 67, "batch_num": 1, "batch_siz": 67, "bay": [56, 57], "bbevi": 18, "becaus": [1, 2, 12, 21, 31, 36, 40, 56, 61], "becom": [44, 61, 62], "been": [1, 2, 12, 16, 31, 36, 61], "befor": [0, 1, 2, 17, 31, 36, 45, 48], "beforehand": 64, "begin": [34, 54, 58, 61, 62, 63], "behavior": [0, 2, 62, 63], "being": [4, 13, 14, 16, 17, 20, 21, 24, 31, 32, 36, 43, 47, 51, 55, 56, 60], "belong": [1, 42], "below": [1, 11, 21, 33, 36, 45, 48, 51, 61, 62, 69], "ber": 54, "bert": [0, 1, 31, 35, 36, 39, 46, 61, 64, 67], "bert_path": 67, "bert_sentiment_data": [1, 61, 64], "best": 29, "better": [31, 61], "between": [4, 6, 13, 21, 23, 24, 28, 30, 31, 34, 35, 36, 37, 40, 45, 46, 55, 58, 59, 62, 64, 65], "betwen": 34, "beyond": 2, "big": 59, "binari": [10, 32, 38], "blame": 47, "blob": [1, 24, 61], "block": [22, 32, 48, 59], "blog": 15, "bold": [22, 64], "bool": [2, 63, 67, 71], "bootstrap": 62, "both": [1, 2, 42, 52, 54, 55, 59, 62], "bother": 50, "bottom": 59, "bought": 41, "bound": [29, 35, 36, 37, 42, 52, 55], "boundari": [34, 35], "break": [22, 48, 64], "brief": 44, "broader": 52, "broken": 59, "btw": 50, "bug": [1, 61], "build": [1, 7, 34, 45, 46, 62], "built": 11, "built_spacy_n": 15, "bullet": [22, 48, 64], "bunch": 59, "burst": 58, "bursti": [1, 11, 39, 58, 61, 65], "by_the_wai": 49, "c": [12, 34, 35, 45, 46, 62], "cach": [0, 1, 2, 51, 61], "calcul": [2, 5, 11, 12, 16, 18, 21, 28, 33, 41, 48, 49, 50, 56, 57, 58, 60, 62, 63, 64, 65, 66, 67, 68, 72, 73], "calculate_chat_level_featur": [1, 61, 69], "calculate_conversation_level_featur": 69, "calculate_hedge_featur": 64, "calculate_id_scor": 13, "calculate_info_divers": 65, "calculate_named_ent": 15, "calculate_num_question_na": 20, "calculate_politeness_senti": 64, "calculate_politeness_v2": 64, "calculate_team_bursti": 65, "calculate_textblob_senti": 64, "calculate_user_level_featur": 69, "calculate_vector_word_mimicri": 64, "calculate_word_mimicri": 64, "call": [1, 2, 8, 13, 61, 62, 64, 69], "can": [0, 1, 2, 11, 23, 31, 32, 33, 34, 36, 37, 42, 43, 44, 47, 48, 49, 50, 52, 54, 60, 61, 62, 69], "can_you": 49, "cannot": [1, 31, 36, 45, 46, 49, 62], "cao": [21, 24, 33, 43, 44, 56, 57, 62], "cap": [22, 48, 64], "capit": [0, 2, 48], "captur": [29, 30, 32, 34, 35, 38, 41, 42, 55], "caract": 40, "cardiffnlp": [1, 61], "carefulli": 60, "carri": 31, "casa_token": 5, "case": [1, 13, 16, 28, 29, 30, 31, 36, 37, 41, 45, 46, 51, 55, 56, 59, 61], "casual": 43, "categori": [21, 32, 45, 46, 49, 52], "caus": [31, 32, 36, 59], "caveat": 1, "center": 62, "central": 34, "centroid": [34, 66], "certain": [5, 19, 30, 42, 45, 46, 49], "certainli": 42, "certainti": [11, 38, 39, 42, 64, 67], "cfm": 4, "chall": [1, 21, 39, 64, 70], "chang": [0, 1, 34, 50, 61, 71], "charact": [2, 3, 15, 19, 37, 49, 62, 64, 65, 66, 71], 
"characterist": 62, "chat": [0, 1, 2, 4, 5, 6, 7, 8, 12, 13, 14, 16, 23, 25, 28, 29, 32, 35, 36, 41, 44, 45, 46, 49, 59, 61, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73], "chat_data": [2, 6, 7, 8, 26, 27, 28, 63, 64, 65, 66, 67, 71], "chat_df": 14, "chat_featur": [1, 61], "chat_level_data": 72, "chat_level_featur": 2, "chatlevelfeaturescalcul": [1, 2, 17, 21, 61, 64, 69], "chats_data": 73, "check": [19, 23, 44, 64, 67, 71], "check_embed": [1, 61, 69], "chen": 62, "choos": 60, "chose": 1, "chronolog": 1, "chunk": [34, 59, 63], "chunk_num": 63, "circlelyt": 13, "citat": [21, 24], "cite": 50, "clarif": [16, 32, 64], "class": [1, 2, 31, 61, 62, 64, 65, 66], "classif": [21, 64], "classifi": [16, 21, 50, 56, 57], "classify_ntri": 16, "classify_text_dalechal": 21, "clean": [2, 17, 19, 67], "clean_text": 19, "clear": 1, "close": [31, 48, 62], "closer": [45, 46, 59], "clue": 62, "cmu": 12, "code": [6, 18, 29, 32, 51, 55, 61, 62, 68], "coeffici": [4, 39, 62, 65, 68], "coerce_to_date_or_numb": 23, "cognit": 62, "col": 2, "colab": [0, 1], "collabor": [59, 62], "collaps": 2, "collect": [1, 2, 34, 49, 50, 52, 61, 62], "colleg": 33, "column": [0, 2, 4, 6, 7, 8, 9, 12, 13, 14, 16, 18, 23, 25, 28, 51, 56, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73], "column_count_frequ": 28, "column_count_mim": 28, "column_mimc": 28, "column_nam": 71, "column_to_summar": 72, "com": [1, 2, 4, 5, 13, 15, 18, 64, 68, 71], "comb": 62, "combin": [0, 1, 6, 28, 64, 71], "come": [1, 12, 13, 21, 32, 33, 58, 61], "comm": [1, 61], "command": [1, 61], "comment": 48, "commit": 23, "commit_data": 19, "common": [0, 32, 62, 64], "commonli": 37, "commun": [0, 1, 11, 44, 48, 55, 60, 62, 64], "companion": 1, "compar": [2, 31, 35, 44, 45, 52, 64, 71, 73], "compat": [0, 1, 61], "complement": [31, 36], "complet": [1, 2, 31, 55], "complex": [0, 35, 43, 50, 62], "compon": 50, "comprehens": [33, 48], "compress": 71, "comput": [0, 2, 4, 5, 6, 10, 11, 12, 13, 14, 28, 29, 30, 31, 34, 35, 36, 37, 40, 41, 42, 45, 46, 49, 52, 55, 62, 64, 65, 66, 69, 73], "compute_frequ": 28, "compute_frequency_per_conv": 28, "compute_vectors_from_preprocess": [0, 2], "computetf": 28, "conain": 61, "concat_bert_featur": [1, 61, 64], "concaten": [19, 49, 64, 71], "concentr": 55, "concept": [29, 39, 42, 62], "conceptu": [61, 62], "concis": 43, "concret": 29, "conduct": 1, "confid": [2, 5, 15, 30, 47, 64], "conflict": 62, "confound": 44, "congruent": 34, "conjection_seper": 19, "conjunct": [19, 31, 36, 49], "conjunction_start": 49, "connect": 39, "conscious": 35, "consecut": 22, "consequ": 0, "consid": [1, 33, 37], "consider": [61, 62], "consist": [31, 36, 40, 41], "constitut": 41, "constrain": [34, 35], "construct": [11, 55, 62], "constructor": 47, "consult": 5, "contain": [1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 23, 25, 28, 29, 30, 35, 38, 42, 47, 49, 55, 61, 62, 63, 64, 67, 71, 72, 73], "content": [0, 1, 12, 13, 28, 34, 36, 39, 41, 42, 45, 46, 62, 64, 67], "content_mimicry_scor": 28, "content_mimicry_score_per_conv": 28, "content_word_accommod": 31, "content_word_accommodation_per_conv": 31, "content_word_mimicri": 28, "context": [2, 32, 42, 48, 62, 71], "continu": [56, 57], "contract": 49, "contrast": 39, "contribut": [13, 34, 37, 62], "control": 1, "conv": [1, 61], "conv_data": [2, 65], "conv_features_al": [1, 61], "conv_features_bas": [1, 61], "conv_level_featur": 2, "conv_to_float_arr": 8, "convei": [6, 34, 52], "conveni": [1, 61], "convers": [0, 2, 3, 4, 6, 7, 8, 9, 12, 13, 23, 25, 28, 29, 31, 34, 35, 36, 37, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 52, 
55, 58, 59, 61, 63, 64, 65, 66, 68, 71, 72, 73], "conversation_id": [2, 28, 61, 71], "conversation_id_col": [0, 1, 2, 4, 6, 7, 8, 9, 13, 23, 25, 26, 27, 61, 63, 64, 65, 66, 68, 72, 73], "conversation_num": [0, 1, 2, 6, 7, 66, 71, 73], "conversationlevelfeaturescalcul": [2, 65, 69], "convert": [8, 41, 49, 71], "convict": 5, "convokit": [17, 50, 62, 64], "coordin": 55, "copi": [0, 1], "copular": [31, 36], "core": [34, 64, 69], "cornel": 17, "corpu": [0, 1, 50], "corrado": 37, "correl": [41, 55], "correspond": [30, 34, 35, 40, 49, 55, 66], "cosin": [6, 7, 13, 28, 31, 34, 35, 36, 40, 45, 46, 65], "could": [1, 31, 33, 36, 50, 54], "could_you": 49, "couldn": [31, 36], "count": [1, 3, 12, 14, 15, 16, 19, 21, 25, 28, 30, 31, 32, 36, 39, 41, 43, 44, 49, 52, 53, 54, 56, 58, 64, 65, 66], "count_all_cap": 22, "count_bullet_point": 22, "count_charact": 3, "count_difficult_word": 21, "count_ellips": 22, "count_emoji": 22, "count_emphasi": 22, "count_line_break": 22, "count_link": 22, "count_match": [19, 49], "count_messag": 3, "count_numb": 22, "count_parenthes": 22, "count_quot": 22, "count_responding_to_someon": 22, "count_spacy_match": 19, "count_syl": 21, "count_turn": 25, "count_turn_taking_index": 25, "count_user_refer": 22, "count_word": 3, "countabl": 65, "countd": 36, "counterfactu": 50, "cours": [16, 31, 34, 36, 63], "cover": 28, "creat": [0, 1, 2, 13, 19, 31, 40, 42, 61, 62, 64, 65, 66, 71], "create_chunk": 63, "create_chunks_messag": 63, "create_cumulative_row": 71, "credit": 33, "crowd": 13, "csv": [1, 2, 61, 62, 67], "cumul": [1, 2, 71], "cumulative_group": [0, 1, 2, 71], "current": [1, 11, 23, 31, 34, 35, 36, 40, 45, 46, 58, 61, 64, 71], "curt": 43, "custom": [0, 62], "custom_featur": [0, 1, 2, 61], "customiz": 62, "cut": 1, "cutoff": [2, 15, 47, 64], "d": [0, 1, 2, 31, 34, 36, 61], "dale": [1, 21, 39, 64, 70], "dale_chall_help": 21, "danescu": 50, "dash": 22, "data": [0, 2, 6, 7, 8, 9, 13, 19, 20, 32, 37, 40, 41, 47, 51, 55, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73], "datafram": [0, 1, 2, 4, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 23, 25, 28, 37, 47, 49, 59, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73], "dataknowsal": 15, "dataset": [1, 2, 9, 12, 13, 28, 31, 41, 47, 52, 61, 64, 65, 66, 73], "date": [1, 61], "datetim": [23, 58], "dcosta": 62, "deal": [50, 59], "death": 1, "debat": 59, "decid": 62, "decis": [1, 13, 62], "declar": [1, 62, 69], "deepli": 62, "default": [0, 1, 2, 5, 13, 16, 23, 30, 34, 35, 42, 47, 62, 63, 66, 67, 71, 73], "defer": [17, 50], "defin": [0, 11, 21, 31, 34, 36, 40, 59, 62, 64, 65, 66, 70], "definit": [1, 3, 44], "degre": [6, 30, 36, 45, 46, 55], "delet": 29, "deliber": 1, "demo": 61, "democrat": 1, "demystifi": 62, "denomin": 59, "densiti": 60, "dep_": 49, "dep_pair": 19, "depend": [0, 1, 10, 19, 32, 49, 52, 61, 63], "deriv": [2, 11, 65, 66], "describ": [1, 11, 62], "descript": [1, 61], "design": [0, 1, 2, 13, 34, 62], "desir": [2, 63, 72], "detail": [0, 33, 41, 43, 61, 62], "detect": [1, 32, 37, 38, 47, 48, 49, 54], "determin": [13, 18, 31, 35, 36, 40, 45, 46, 71], "dev": 24, "develop": [5, 37, 40, 62], "deviat": [4, 5, 29, 40, 41, 55, 58, 65, 72, 73], "df": [4, 8, 9, 12, 13, 16, 18, 23, 28, 63, 71], "dict": [17, 19, 28, 67], "dictionari": [1, 15, 17, 19, 28, 30, 42, 49, 61, 67], "did": [1, 31, 36, 37, 47, 50, 54, 62], "didn": [31, 36], "differ": [0, 1, 2, 4, 11, 12, 23, 28, 29, 31, 34, 36, 37, 39, 40, 44, 45, 46, 47, 49, 55, 62, 63, 64, 65, 66, 71], "differenti": [49, 59], "difficult": [21, 33], "difficult_word": 21, "difficulti": 33, "dimens": [40, 62], 
"dimension": [34, 35], "dinner": 41, "direct": [34, 43, 45, 47, 50, 69], "direct_quest": [32, 50, 54], "direct_start": 50, "directli": [1, 62, 69], "directori": [0, 2, 19, 61, 65, 67], "disagr": 49, "disagre": 51, "discours": [31, 36], "discret": [31, 36, 45, 46], "discurs": [0, 1, 6, 8, 39, 40, 61, 65, 66], "discursive_divers": 11, "discus": 8, "discuss": [0, 1, 31, 34, 39, 40, 42, 43, 61, 62, 71], "dispers": 68, "displai": [1, 34, 42, 46, 61], "dispos": 1, "distanc": [34, 35, 40], "distinct": [31, 36, 59], "distinguish": 59, "distribut": 31, "div": 16, "diverg": [6, 34, 35], "divers": [0, 1, 6, 8, 13, 39, 61, 65], "divid": [16, 34, 59, 63], "dl": [21, 24], "do": [0, 1, 29, 31, 34, 36, 37, 43, 49, 50, 54, 62, 69], "doc": [2, 19], "doc_top": 13, "document": [1, 17, 61, 69], "doe": [1, 2, 29, 40, 42, 43, 45, 47, 54, 61, 71], "doesn": [0, 1, 2, 29, 31, 36, 45, 61], "doi": [5, 6, 21, 24, 64], "domain": [31, 50], "don": [31, 36, 49, 54, 62, 67], "done": [2, 50], "dot": 22, "doubl": 30, "down": [31, 36], "download": [1, 61], "download_resourc": [1, 61], "downstream": [17, 62], "dozen": 62, "drive": [62, 69], "driver": [2, 61, 64, 65, 66], "drop": [0, 2, 64], "due": [34, 59], "duncan": 62, "duplic": [1, 2, 71], "durat": [58, 63], "dure": [2, 55, 59, 62], "dynam": [59, 61], "e": [0, 1, 2, 4, 15, 20, 29, 30, 31, 32, 34, 35, 36, 37, 38, 41, 42, 47, 48, 49, 52, 54, 56, 59, 61, 63, 65, 66, 71], "e2": [21, 70], "each": [0, 1, 2, 3, 4, 7, 8, 9, 11, 12, 15, 17, 19, 23, 25, 28, 30, 31, 34, 35, 36, 37, 40, 41, 42, 44, 45, 46, 47, 49, 50, 51, 52, 55, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73], "earlier": [0, 1, 2, 42], "easi": [1, 21, 62, 70], "easier": 21, "easili": 33, "easy_word": 21, "eat": 34, "echo": 31, "econom": 37, "edg": [29, 59], "edu": [1, 12, 16, 17, 70], "effect": [1, 41], "effici": 1, "effort": 55, "either": [20, 23, 52, 55], "elaps": [23, 58], "element": [1, 6], "ellips": [22, 48, 64], "els": [1, 22, 47, 64], "embed": [8, 31, 34, 35, 36, 45, 46, 65, 66, 67, 69], "emili": [30, 35, 45, 46, 47, 59, 62], "emoji": [22, 48, 64], "emot": [1, 61], "emoticon": 48, "emphas": [22, 48, 64], "emphasi": 48, "empirica": [1, 2, 71], "emploi": 45, "empti": [0, 2, 13, 67], "en": [1, 21, 24, 61, 70], "en_core_web_sm": [1, 61], "enabl": 71, "enclos": 22, "encod": [1, 8], "encompass": 62, "encount": [1, 34, 35, 61], "encourag": 64, "end": [0, 1, 15, 20, 23, 34, 54, 62, 63], "engag": 43, "engin": 2, "english": [34, 42], "enjoi": 62, "ensur": [0, 1, 40, 49, 61, 63, 67, 71], "entir": [1, 12, 28, 31, 36, 40, 41, 52, 59, 62, 73], "entiti": [0, 1, 2, 15, 39, 64], "entityrecogn": 47, "entri": [1, 28, 61], "ep8dauru1ogvjurwdbof5h6ayfbslvughjyiv31d_as6ppbt": 5, "equal": [1, 21, 23, 34, 37, 40, 55, 59, 61, 62, 63], "equival": [0, 1, 41, 55, 61], "eric": 62, "error": [1, 16, 61], "especi": [41, 62], "essenti": [51, 71], "establish": 31, "estim": 31, "et": [1, 5, 16, 18, 21, 24, 30, 31, 32, 33, 34, 35, 36, 38, 42, 43, 44, 49, 50, 52, 53, 54, 56, 57, 58, 59, 60, 64], "etc": [10, 15, 16, 17, 42], "evalu": [5, 47, 50], "evan": 62, "even": [0, 1, 2, 34, 37, 42, 62, 63, 67], "evenli": [34, 55], "event": [1, 34, 55, 61], "ever": 62, "everi": [1, 4, 13, 31, 34, 35, 36, 61, 62], "everybodi": [31, 36], "everyon": [31, 36, 47, 62], "everyth": [31, 36, 56], "everywher": [31, 36], "evolut": 35, "evolv": [35, 71], "exactli": [1, 2, 71], "examin": [40, 62, 63], "exampl": [0, 10, 11, 15, 21, 24, 29, 31, 32, 34, 37, 42, 43, 48, 50, 51, 54, 56, 59, 60, 61, 62], "example_data": 1, "exce": 15, "exchang": [12, 35, 39, 40, 45, 55, 
64], "exclud": [0, 41, 42], "exclus": [41, 42], "excus": 32, "exhibit": 35, "exist": [0, 1, 2, 55, 61, 62, 63, 64, 67], "expand": 49, "expect": [1, 37, 47], "expected_valu": 47, "explain": [0, 29], "explan": [29, 43], "explor": [61, 62], "express": [5, 14, 30, 31, 32, 36, 38, 42, 64], "extend": 1, "extens": [43, 44], "extent": [1, 4, 7, 12, 31, 34, 35, 37, 51, 55, 59, 61], "extern": 48, "extra": 51, "extract": [1, 17, 19, 28, 40, 50, 64], "extrem": [55, 56, 57], "face": [1, 51, 61], "facilit": [62, 71], "fact": [4, 35, 50, 54, 59], "factual": [17, 24, 50], "fail": [1, 61], "fals": [0, 1, 2, 31, 54, 61, 71], "famili": 42, "far": [34, 35, 46, 50, 62], "faster": 14, "feat_count": 19, "featuer": 2, "featur": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 63, 64, 65, 66, 67], "feature_build": [0, 1, 61], "feature_dict": [1, 61], "feature_method": [64, 65], "feature_nam": [1, 61], "featurebuild": [0, 2, 47, 69], "features_conceptu": [1, 61], "few": [48, 62], "fewer": [12, 60], "fflow": 11, "field": [13, 17], "file": [0, 2, 12, 14, 19, 61, 65, 67], "filenam": [1, 2, 19], "filenotfounderror": 67, "fill": 71, "filler": [37, 60], "filler_paus": 49, "filter": [19, 62], "final": [1, 2, 34, 42, 62], "find": [1, 19, 28, 50], "fingertip": 62, "finit": 55, "first": [0, 1, 2, 11, 12, 16, 19, 31, 34, 35, 36, 39, 40, 41, 42, 45, 46, 49, 52, 54, 59, 61, 62, 64, 70, 71], "first_person": 12, "first_person_plur": 49, "first_person_raw": [12, 16], "first_person_singl": 49, "five": 37, "fix": 52, "flag": 71, "float": [0, 2, 4, 5, 6, 8, 10, 13, 14, 16, 21, 24, 25, 28, 68], "floor": 59, "flow": [0, 1, 7, 31, 36, 39, 41, 45, 46, 61, 64], "focal": [31, 36], "focu": 41, "folder": [0, 1, 19], "follow": [1, 2, 16, 17, 29, 31, 32, 33, 41, 42, 47, 49, 50, 53, 55, 59, 60, 61, 64, 65], "for_m": 49, "for_you": 49, "forc": [0, 1, 61], "form": 1, "formal": [1, 61], "formal_titl": 49, "format": [0, 1, 8, 17, 22, 47, 48, 61, 62, 64], "former": [45, 46], "formula": [33, 42, 59, 64, 70], "fornt": 1, "forward": [0, 1, 7, 39, 41, 61, 64], "forward_flow": 35, "found": [1, 2, 5, 28, 30, 33, 61, 69], "four": [1, 8], "fourth": 33, "frac": 55, "fraction": 59, "frame": 64, "framework": [49, 50, 62], "frequenc": [28, 31, 44, 64], "frequency_dict": 28, "fridai": 34, "from": [0, 1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 19, 21, 28, 29, 31, 32, 33, 34, 35, 36, 39, 41, 42, 49, 50, 51, 53, 55, 56, 57, 58, 61, 62, 64, 65, 66, 67, 71], "full": [1, 2, 28, 37], "full_empirical_dataset": 1, "fulli": [32, 48], "functinon": 12, "function": [1, 2, 3, 4, 10, 11, 12, 13, 14, 16, 20, 21, 23, 28, 31, 39, 44, 45, 46, 50, 56, 57, 60, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73], "function_mimic_word": 28, "function_mimicry_scor": 28, "function_word_mimicri": 28, "function_word_refer": 28, "fund": 62, "further": [1, 2, 61, 71], "futur": [23, 66], "g": [0, 1, 4, 15, 20, 29, 31, 32, 36, 37, 38, 41, 42, 47, 48, 52, 54, 59, 61, 63, 65, 66, 71], "game": [1, 2, 59, 71], "gaug": [33, 52], "gener": [0, 2, 9, 11, 12, 16, 21, 31, 34, 35, 36, 40, 42, 45, 46, 49, 51, 59, 67, 69, 71, 72], "generaliz": 23, "generate_bert": 67, "generate_certainty_pkl": 67, "generate_lexicon_pkl": 67, "generate_vect": 67, "gensim": 40, "get": [16, 20, 21, 28, 30, 31, 36, 49, 66, 67], "get_all_dd_featur": 11, "get_averag": 72, "get_centroid": 66, "get_certainti": 5, "get_certainty_scor": 64, "get_content_words_in_messag": 28, "get_conversation_level_aggreg": 65, "get_cosine_similar": 6, "get_dale_chall_easy_word": [21, 70], 
"get_dale_chall_score_and_classf": 64, "get_dd": 6, "get_dd_featur": 8, "get_dep_pair": [19, 49], "get_dep_pairs_noneg": [19, 49], "get_discursive_diversity_featur": 65, "get_first_pct_of_chat": 2, "get_first_person_word": [12, 70], "get_forward_flow": [7, 64], "get_function_word": 70, "get_function_words_in_messag": 28, "get_gini": 68, "get_gini_featur": 65, "get_info_divers": 13, "get_info_exchange_wordcount": 12, "get_liwc_count": 14, "get_max": 72, "get_mimicry_bert": 28, "get_min": 72, "get_moving_mimicri": 28, "get_named_ent": 64, "get_nan_vector": [27, 67], "get_polarity_scor": 24, "get_politeness_strategi": 17, "get_politeness_v2": 18, "get_proportion_first_pronoun": 16, "get_question_word": 70, "get_reddit_featur": 64, "get_senti": 67, "get_stdev": 72, "get_subjectivity_scor": 24, "get_sum": 72, "get_team_bursti": 4, "get_temporal_featur": [4, 64], "get_time_diff": 23, "get_time_diff_startend": 23, "get_turn": 25, "get_turn_id": 71, "get_turn_taking_featur": 65, "get_unique_pairwise_combo": 6, "get_user_average_datafram": 72, "get_user_level_aggreg": 65, "get_user_level_averaged_featur": 66, "get_user_level_summary_statistics_featur": 66, "get_user_level_summed_featur": 66, "get_user_network": [11, 66], "get_user_sum_datafram": 72, "get_variance_in_dd": 26, "get_within_person_disc_rang": 27, "get_word_ttr": 16, "get_zscore_across_all_chat": 73, "get_zscore_across_all_convers": 73, "gina": 62, "gini": [39, 62, 65, 68], "gini_coeffici": [11, 69], "github": [0, 1, 2, 18, 71], "give": [0, 1, 29, 37], "give_ag": 49, "given": [5, 6, 13, 14, 28, 30, 31, 33, 34, 35, 36, 40, 41, 55, 59, 66, 67, 71], "go": [1, 34, 35, 45, 46, 50, 62], "goal": 62, "good": [50, 56, 62], "goodby": 49, "googl": [0, 1], "got": [31, 36], "gotta": [31, 36], "grade": 33, "grader": 21, "grai": 35, "grammat": 36, "granularli": 35, "grate": 62, "gratitud": [17, 49, 50], "great": [47, 50, 51, 56, 59, 60, 62], "greater": 55, "greet": 50, "groceri": 41, "group": [0, 1, 2, 4, 13, 29, 33, 34, 41, 52, 59, 62, 68, 71, 72], "grouping_kei": [0, 1, 2, 71], "gt": 22, "guess": 10, "gun": 1, "gy": 15, "gym": 34, "ha": [0, 1, 2, 32, 34, 35, 37, 42, 43, 46, 52, 54, 55, 56, 59, 61, 62, 63, 71], "had": [1, 31, 36, 54, 61], "hadn": [31, 36], "handl": [19, 29, 71], "happen": [1, 2, 55, 62, 63], "happi": 42, "harder": 21, "hashedg": [17, 50], "hasn": [31, 36], "hasneg": 50, "hasposit": 50, "hate": 31, "have": [0, 1, 2, 10, 12, 16, 31, 34, 36, 37, 40, 41, 42, 45, 46, 50, 54, 59, 60, 61, 62, 71], "haven": [31, 36], "he": [1, 31, 36], "header": 18, "hear": 32, "heart": [61, 62], "heat": 1, "heavi": 62, "hedg": [11, 30, 39, 49, 50, 64], "hei": [1, 35, 45, 46, 50], "helena": [47, 62], "hello": [0, 43, 49], "help": [0, 31, 34, 36, 43, 45, 46, 52, 58, 69], "helper": [23, 67], "her": [30, 31, 36], "here": [1, 29, 31, 34, 41, 42, 47, 61, 62, 66], "herself": [31, 36], "hesit": [60, 64], "hi": [31, 35, 36, 43, 45, 46], "hierach": 71, "hierarch": 71, "high": [0, 1, 2, 61, 62, 71], "higher": [21, 31, 34, 36, 40, 41, 42, 44, 45, 46, 55, 60], "highest": 71, "highlight": 1, "him": [31, 36], "himself": [31, 36], "hmm": [31, 36], "hoc": 62, "hold": 31, "hole": 62, "home": 42, "homework": 34, "homonym": 31, "hood": 1, "hope": 35, "host": [45, 46], "hour": 48, "how": [1, 5, 28, 29, 30, 31, 34, 35, 36, 39, 43, 45, 51, 52, 54, 56, 62], "howev": [0, 1, 3, 35, 40, 42, 44, 54, 56, 61, 62], "howitwork": 1, "html": [1, 2, 15, 17, 24, 61], "http": [1, 2, 4, 5, 6, 12, 13, 15, 16, 17, 18, 21, 24, 41, 45, 46, 47, 61, 64, 68, 70, 71], "hu": [1, 42, 62], "hug": [1, 51, 
61], "huggingfac": 1, "huh": [31, 32, 36], "human": [37, 50, 62], "hyperlink": 48, "hyphen": [1, 61], "hypothet": 42, "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 19, 20, 21, 22, 23, 24, 25, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 71, 73], "iby1": 5, "id": [2, 4, 7, 23, 28, 62, 66, 68, 71, 72, 73], "idea": [12, 35, 40, 47, 51], "ident": [34, 35], "identif": 1, "identifi": [0, 1, 2, 4, 8, 9, 15, 23, 25, 30, 31, 41, 47, 50, 52, 61, 63, 64, 71, 72], "identiif": [13, 71], "ignor": [1, 32], "illustr": [1, 41, 48, 62], "imagin": 1, "immedi": [31, 35, 64], "impact": [1, 60], "impersonal_pronoun": 49, "implement": 64, "impli": 37, "import": [31, 32, 36, 44, 45, 62, 69], "incent": 13, "includ": [0, 1, 2, 10, 17, 22, 31, 32, 35, 36, 42, 45, 46, 51, 52, 56, 61, 62, 66, 71], "inclus": [13, 71], "incongru": [8, 34], "incorpor": [1, 42, 45, 46], "increas": [1, 42, 62], "increment": 71, "independ": 1, "index": [1, 2, 4, 13, 25, 37, 39, 55, 61, 65], "indic": [1, 2, 16, 21, 22, 30, 32, 34, 35, 36, 40, 41, 43, 44, 48, 49, 50, 52, 55, 60, 63, 71], "indirect": 50, "indirect_btw": 50, "indirect_greet": 50, "indirectli": 69, "individu": [0, 1, 5, 11, 31, 34, 37, 45, 50, 59, 60, 62, 72], "inequ": 37, "infer": [1, 51, 67], "influenc": 1, "info": [13, 18, 64], "info_divers": 13, "info_exchang": 64, "info_exchange_wordcount": [41, 64], "info_exchange_zscor": 11, "inform": [6, 11, 12, 13, 24, 32, 34, 39, 48, 62, 64, 65], "informal_titl": 49, "information_divers": 11, "initi": [2, 62, 63, 64, 65, 66], "input": [0, 2, 4, 6, 12, 13, 14, 15, 16, 19, 20, 22, 28, 50, 55, 60, 62, 63, 64, 65, 66, 67, 71, 72], "input_column": [65, 66], "input_data": [25, 68, 72], "input_df": [1, 2, 61, 71], "inquiri": [30, 39, 52], "insid": 1, "insight": 1, "inspir": 15, "instal": [1, 61, 62], "instanc": [1, 22, 50, 59, 66], "instanti": 2, "insteac": 1, "instead": [1, 2, 62], "instruct": [1, 61], "int": [2, 3, 10, 13, 15, 16, 19, 20, 22, 28, 63, 64, 67], "intact": 71, "integ": [0, 13, 40, 47], "intend": 59, "interact": [1, 11, 43, 44, 62, 69], "interconnect": 62, "interest": [1, 61, 62], "interfac": 62, "intermedi": [59, 64], "intern": 29, "interpret": [0, 23], "interrupt": 59, "interv": [58, 65], "introduc": 62, "introduct": [11, 61], "invalid": 67, "invers": 64, "involv": [41, 62, 65], "io": [1, 24, 47, 61], "ipynb": [0, 1], "is_hedged_sentence_1": 10, "isn": [1, 31, 36], "issu": [1, 31, 36, 37, 42, 61], "ital": 64, "italic": 22, "item": [0, 71], "its": [0, 2, 15, 31, 35, 36, 40, 41, 47, 54, 55, 64, 69], "itself": [23, 31, 36, 44], "john": 1, "jonson": 62, "journal": [5, 64], "json": [1, 61], "jurafski": 70, "juri": 1, "juries_df": 1, "jury_conversations_with_outcome_var": 1, "jury_feature_build": 1, "jury_output": 1, "jury_output_chat_level": [1, 61], "jury_output_turn_level": 1, "just": [1, 2, 31, 36, 46, 50, 59, 61, 62], "katharina": 34, "keep": [1, 71], "kei": [1, 2, 4, 19, 28, 30, 54, 61, 71], "keyerror": 71, "keyword": [19, 49], "kind": [10, 62], "kitchen": 42, "knob": 0, "know": [1, 30], "knowledg": 29, "known": [1, 32, 61], "kumar": 62, "kw": 19, "lab": [1, 2, 62, 71], "label": [1, 15, 21, 51], "lack": [31, 38, 45, 46], "languag": [15, 31, 34, 42, 50, 62], "larg": [31, 69], "larger": [0, 31, 61], "last": [1, 31], "late": 32, "later": [0, 1, 2, 42, 61], "latest": [1, 61], "latter": [31, 36], "lda": [13, 40], "learn": [1, 61, 62], "least": [10, 32, 42, 63, 67], "led": 62, "legal": 49, "lemmat": [13, 40], 
"len": 28, "length": [35, 39, 41, 42, 44], "less": [13, 32, 50, 52, 55, 62, 63], "let": [41, 49, 53], "let_me_know": 49, "letter": [49, 71], "level": [0, 1, 2, 3, 4, 6, 7, 8, 9, 12, 13, 14, 16, 23, 61, 64, 65, 66, 71, 72], "lexic": [10, 12, 14, 16, 31, 32, 36, 42, 60, 62, 64], "lexical_featur": [14, 64], "lexical_features_v2": [10, 11], "lexicon": [5, 10, 14, 30, 39, 50, 52, 67, 69], "lexicons_dict": 67, "librari": [34, 51, 56, 57], "lift": 62, "light": 61, "like": [1, 22, 31, 34, 36, 41, 50, 61, 62], "limiat": 32, "limit": [11, 32, 37, 42, 54], "line": [0, 1, 19, 22, 48, 61, 62, 64], "linear": 64, "linguist": [18, 19, 30, 39, 50, 52], "link": [22, 29, 48, 50, 64], "list": [1, 2, 6, 7, 10, 11, 12, 13, 15, 19, 20, 21, 22, 28, 31, 33, 36, 37, 42, 48, 49, 50, 53, 54, 61, 64, 65, 66, 67, 68, 70, 71], "literatur": 62, "littl": 38, "littlehors": 1, "liu": [42, 52], "live": [1, 54], "liwc": [14, 30, 39, 51, 52, 56, 62], "liwc_featur": [10, 14], "lix": 34, "ll": [1, 31, 36, 61], "load": [19, 69], "load_saved_data": 19, "load_to_dict": 19, "load_to_list": 19, "loc": 15, "local": [1, 51, 61], "locat": [1, 62], "long": [4, 42], "longer": [30, 41, 43, 48, 61, 62], "look": [2, 34, 61, 65, 66], "loos": 36, "lot": [31, 36], "loud": 60, "love": [31, 56], "low": [1, 2, 29, 55, 60, 71], "lower": [21, 31, 33, 36, 41, 42, 44, 55, 60], "lowercas": [2, 13, 40, 48, 49, 71], "lowest": 71, "lpearl": 16, "lst": 6, "m": [0, 2, 23, 30, 31, 36], "made": [1, 23, 35, 59, 61, 62], "magnitud": 55, "mai": [1, 2, 11, 28, 31, 32, 35, 36, 37, 41, 42, 43, 44, 54, 61, 62, 71], "main": [1, 2, 5, 62, 64, 65, 66], "make": [1, 5, 31, 34, 55, 56, 62, 66, 69, 71], "man": 62, "mani": [1, 4, 11, 32, 37, 41, 60, 62, 66], "manner": [55, 62], "manual": [1, 61], "map": [13, 34], "mark": [19, 20, 22, 43, 54, 64, 71], "marker": [18, 32, 39, 42, 50, 51, 52, 54, 56], "marlow": 44, "matarazzo": 62, "match": [1, 5, 16, 19, 30], "math": 34, "matter": [28, 47], "max": 66, "max_num_chunk": 63, "maxim": [34, 35, 37], "maximum": [63, 65, 72], "mayb": [38, 47], "mcfarland": 70, "me": [31, 32, 36, 41, 50, 53], "mean": [0, 1, 4, 6, 11, 13, 21, 29, 31, 34, 36, 40, 41, 42, 47, 55, 56, 58, 61, 62, 65, 66, 73], "meaning": [31, 41, 55], "meaningless": 41, "meant": 39, "measur": [0, 7, 12, 13, 20, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 51, 52, 54, 55, 56, 57, 58, 59, 60, 62, 64, 68], "mechan": 32, "medium": 21, "meet": 48, "member": [13, 34, 37, 55], "merg": [2, 8, 65, 66], "merge_conv_data_with_origin": 2, "messag": [0, 1, 2, 3, 4, 5, 8, 11, 12, 13, 14, 15, 16, 17, 20, 21, 22, 23, 24, 28, 30, 31, 34, 35, 36, 37, 39, 41, 45, 46, 47, 48, 50, 51, 52, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 71, 73], "messaga": 61, "message_col": [0, 1, 2, 12, 13, 14, 61, 64, 65, 67, 71], "message_embed": [6, 7, 8], "message_lower_with_punc": 71, "metadata": [0, 1], "method": [5, 31, 41, 50, 62], "metric": [0, 1, 2, 8, 30, 34, 35, 46, 47, 48, 55, 66], "michael": 1, "mid": [1, 2, 71], "middl": [21, 34, 63], "might": [0, 1, 29, 43, 48, 53], "mikeyeoman": [18, 64], "mileston": 34, "millisecond": [0, 2], "mimic": [28, 31, 36, 45], "mimic_word": 28, "mimick": [28, 31, 64], "mimicri": [0, 1, 28, 31, 35, 36, 39, 61, 64], "mimicry_bert": [45, 46], "mind": [1, 35, 50], "mine": [31, 36, 53, 59], "minim": [0, 41, 60], "minimum": [65, 72], "minu": [12, 41, 64], "minut": [55, 58], "mirror": 1, "miss": [1, 32, 61, 71], "mitig": [31, 36], "mizil": 50, "mm": [31, 36], "mnsc": 6, "modal": 50, "mode": 60, "model": [1, 13, 15, 31, 34, 35, 36, 40, 45, 46, 47, 51, 62, 
67], "modif": 35, "modifi": [1, 9, 19, 32, 64], "modul": [0, 1, 11, 34, 49, 61, 69], "monologu": 59, "more": [0, 1, 2, 11, 12, 22, 23, 24, 28, 31, 32, 34, 36, 37, 40, 41, 42, 43, 44, 45, 46, 50, 52, 55, 59, 61, 62, 71], "morn": 1, "most": [24, 31, 55, 62, 69], "motiv": 61, "move": [0, 1, 28, 31, 36, 39, 45, 59, 61], "movi": 31, "much": [1, 28, 31, 34, 35, 36, 45, 62], "multi": [1, 2, 71], "multidimension": [45, 46], "multipl": [0, 1, 2, 19, 62, 71], "must": [1, 6, 62, 71], "my": [30, 31, 35, 36, 45, 46, 50, 53], "my_chat_featur": 1, "my_feature_build": 61, "my_fil": 1, "my_output": 61, "my_output_chat_level": 61, "my_output_conv_level": 61, "my_output_user_level": 61, "my_pandas_datafram": 61, "myself": [31, 36, 53], "n": [0, 2, 35, 45, 46, 47, 57, 59, 60], "n_chat": 59, "na": [5, 33, 43, 44, 48, 49, 50, 53, 58], "naiv": [2, 20, 32, 34, 38, 39, 53, 56, 57, 64], "name": [0, 2, 4, 7, 8, 9, 12, 13, 14, 15, 17, 19, 23, 25, 28, 30, 32, 35, 39, 45, 46, 50, 51, 56, 63, 64, 66, 67, 68, 71, 72, 73], "name_to_train": 47, "named_ent": [15, 47], "named_entity_recognition_featur": 11, "nan": [0, 34, 67], "nate": [35, 45, 46], "nathaniel": [35, 45, 46], "nativ": 50, "natur": [43, 55], "ndarrai": 68, "nearest": [13, 40], "nearli": 62, "necessari": [63, 67], "need": [0, 1, 2, 21, 62, 66, 67], "need_sent": 67, "need_senti": 67, "neg": [1, 24, 29, 31, 34, 35, 36, 42, 50, 51, 52, 54, 56, 61, 62, 67], "negat": [19, 49], "negative_bert": [1, 51, 61], "negative_emot": [49, 51, 52, 56], "negoti": 62, "neighborhood": 54, "neither": 30, "ner": 15, "ner_cutoff": [0, 1, 2, 47, 64], "ner_train": 64, "ner_training_df": [0, 1, 2, 47, 64], "nest": [0, 1, 2, 22, 71], "net": [45, 46], "network": 11, "neutral": [1, 5, 24, 30, 51, 55, 61, 67], "neutral_bert": [1, 51, 61], "never": 1, "new": [1, 4, 13, 34, 61, 64, 65, 66, 72], "new_column_nam": 72, "next": [1, 32, 47, 58], "nice": [1, 50, 54, 61], "nicknam": 1, "niculescu": 50, "night": 31, "nikhil": [59, 62], "nltk": [1, 42, 61], "nobodi": [31, 36], "nois": 32, "non": [1, 2, 28, 31, 37, 48, 61, 62, 71], "none": [1, 2, 19, 23, 37, 55, 61, 64, 65, 66, 67], "nor": 30, "normal": [19, 28, 31], "notabl": 62, "note": [0, 1, 2, 12, 16, 20, 42, 61, 71], "notebook": [0, 1], "noth": [31, 36, 56], "noun": 1, "novel": [45, 46], "now": [0, 1, 2], "nowher": [31, 36], "np": [67, 68], "ntri": 32, "null": 34, "num": 48, "num_char": 65, "num_chunk": [27, 63], "num_hedge_word": 10, "num_messag": 65, "num_named_ent": [15, 47], "num_row": 63, "num_top": 13, "num_word": [12, 16, 65], "number": [0, 3, 11, 12, 13, 14, 15, 16, 19, 20, 21, 22, 23, 25, 28, 31, 32, 34, 36, 37, 40, 41, 42, 43, 44, 47, 48, 49, 54, 56, 58, 59, 60, 62, 63, 64, 66, 69, 71, 72], "numer": [0, 1, 2, 13, 33, 68, 72, 73], "numpi": [1, 61, 68], "o": 35, "object": [1, 2, 19, 44, 50, 57, 58, 61, 62, 64, 65, 66], "obtain": [1, 13, 17, 23, 24, 34, 61], "occur": [0, 4, 31, 42, 71], "occurr": 19, "off": [1, 31, 36], "offer": 0, "offici": 61, "often": [28, 36, 47, 48, 62], "oh": [31, 36, 48], "okai": [31, 36], "older": [1, 61], "on_column": [18, 23, 28, 68, 72, 73], "onc": [1, 2, 11, 58, 61, 62], "one": [0, 1, 2, 4, 10, 12, 19, 23, 25, 28, 29, 31, 32, 36, 37, 47, 51, 56, 59, 61, 62, 67, 68, 71, 73], "ones": [31, 36], "onli": [0, 1, 2, 5, 11, 23, 29, 31, 32, 34, 36, 37, 45, 53, 58, 59, 61, 62, 71], "onlin": [1, 32, 39, 64], "onward": 0, "open": [0, 62, 66], "operation": [39, 50, 59], "opinion": [24, 31], "oppos": [2, 31, 34, 35, 55], "opposit": 34, "option": [1, 2, 37, 62, 63, 67, 71], "order": [0, 1, 35, 37, 42, 71], "org": [2, 6, 15, 
21, 24, 41, 70], "organ": 1, "origin": [1, 2, 5, 12, 21, 31, 32, 35, 36, 37, 45, 46, 49, 59], "orthogon": 34, "other": [1, 2, 9, 11, 28, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 45, 46, 48, 51, 52, 54, 56, 58, 59, 61, 62, 64, 66, 71], "other_lexical_featur": [11, 64], "otherwis": [2, 10, 21, 23, 32, 38, 63, 67], "our": [0, 1, 2, 11, 13, 29, 31, 32, 36, 37, 39, 53, 59, 61, 71], "ourselv": 53, "out": [1, 2, 16, 19, 31, 36, 55, 60, 62], "outcom": [1, 44, 62], "output": [0, 2, 10, 17, 19, 40, 61, 62, 64, 67], "output_file_bas": [0, 1, 2, 61], "output_file_path_chat_level": [1, 2], "output_file_path_conv_level": [1, 2], "output_file_path_user_level": [1, 2], "output_path": 67, "outsid": [1, 2, 12], "over": [1, 16, 29, 31, 34, 35, 36, 37, 53, 55, 60, 62, 71], "overal": [30, 31, 34, 36, 45, 46], "overrid": [0, 1, 2], "overview": [0, 1, 61, 62], "overwritten": 1, "own": [0, 1, 9, 35, 62], "p": 55, "pacakg": 24, "pace": [43, 62], "packag": [17, 18, 40, 62], "pad": 19, "page": [1, 11, 29, 39, 61, 62, 69], "pair": [6, 19, 34, 49, 71], "pairwis": [6, 34], "panda": [0, 1, 2, 12, 14, 16, 23, 47, 64, 65, 66, 71, 72, 73], "paper": [4, 5, 12, 18, 29, 40, 50, 64], "paragraph": 22, "param": 71, "paramet": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 47, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73], "paramt": 1, "pardon": 32, "parenthes": [22, 48, 64], "parenthet": [22, 48], "pars": [16, 50, 60], "part": [1, 10, 13, 29, 36, 42, 52, 71], "particip": [1, 9, 37, 62], "particl": [31, 36], "particular": [11, 31, 32, 34, 41, 45, 47, 51, 59, 62], "particularli": 42, "partner": 32, "pass": [1, 13, 21, 47, 71], "path": [1, 2, 19, 61, 67], "path_in": 19, "pattern": [4, 11, 19, 55, 62, 67], "paus": 4, "pd": [1, 2, 4, 6, 7, 8, 9, 12, 13, 14, 15, 16, 18, 19, 23, 25, 63, 64, 65, 66, 67, 68, 71], "pdf": [5, 12, 13, 16, 18, 21, 24, 64, 70], "penalti": 1, "pennebak": [12, 37, 41, 42, 52], "pennyslvania": 62, "peopl": [1, 32, 59, 62], "per": [1, 6, 9, 19, 42, 63, 66, 72], "percentag": [2, 21], "perfect": [37, 59], "perform": [0, 1, 2, 16, 50], "perhap": 1, "period": [4, 34, 55], "person": [1, 8, 12, 15, 16, 32, 34, 39, 41, 42, 50, 59, 62, 64, 70], "perspect": 1, "petrocelli": 5, "phrase": [19, 30, 38, 54], "phrase_split": 19, "pickl": [19, 67], "piec": [36, 42, 59, 63], "pl": 50, "place": [55, 61, 62], "plan": [34, 35, 45, 46], "player": 59, "pleas": [0, 1, 38, 49, 50, 61, 62], "please_start": 50, "point": [22, 24, 34, 35, 45, 46, 48, 52, 64, 66], "poisson": 55, "polar": [24, 39, 51, 52, 64], "polit": [1, 17, 18, 30, 32, 38, 39, 42, 51, 52, 54, 56, 64], "politeness_featur": 11, "politeness_v2": 11, "politeness_v2_help": 11, "politenessstrategi": [17, 50], "portion": 0, "posit": [0, 1, 11, 15, 24, 29, 31, 39, 42, 50, 51, 54, 56, 61, 62, 64, 67], "positive_affect_lexical_per_100": [51, 52, 56], "positive_bert": [1, 51, 61], "positive_emot": [49, 51, 52, 56], "positivity_bert": [1, 61], "positivity_zscor": 64, "positivity_zscore_chat": 52, "positivity_zscore_convers": 52, "possess": 31, "possibl": [1, 34, 62, 66], "possibli": [38, 62], "practic": [34, 35], "pre": [1, 4, 21, 37, 49, 64], "preced": [31, 35, 71], "precend": 35, "precis": 47, "precomput": 51, "predefin": 19, "predetermin": [31, 36], "predict": [2, 47, 51, 64], "prefer": [0, 1], "preload_word_list": 69, "prep_simpl": 19, "prep_whol": 19, "preposit": [31, 36], "preproces": 48, "preprocess": [0, 1, 2, 13, 19, 40, 43, 49, 51, 61, 69], "preprocess_chat_data": 2, "preprocess_conversation_column": 71, "preprocess_naive_turn": 71, 
"preprocess_text": 71, "preprocess_text_lowercase_but_retain_punctu": 71, "presenc": [2, 32, 67], "present": [1, 2, 14, 30, 31, 38, 42, 55, 62, 71], "prespecifi": 19, "prevent": 51, "previou": [1, 7, 28, 31, 36, 45, 46, 58, 64, 71], "primari": 34, "print": 2, "prior": [2, 64, 71], "priya": [47, 62], "probabl": [15, 47], "problem": 62, "procedur": 62, "proceed": 46, "process": [0, 1, 2, 4, 10, 21, 37, 55, 62, 64, 65, 67, 69, 71], "prodi": 15, "produc": [2, 34], "product": 15, "professor": 62, "progress": [1, 2], "project": [54, 62], "pronoun": [12, 16, 31, 36, 39, 41, 42, 64, 70], "proper": 1, "properti": [1, 61], "proport": [16, 39, 42, 64], "propos": 37, "provid": [0, 1, 2, 15, 29, 30, 33, 36, 39, 44, 47, 54, 62], "proxi": 42, "pseudonym": 1, "psycholog": 42, "pub": 70, "publish": [5, 30, 64], "pubsonlin": 6, "punctuat": [0, 2, 16, 19, 20, 21, 28, 43, 54, 60, 71], "punctuation_seper": 19, "puncut": 48, "pure": [24, 36], "purpos": 1, "put": [34, 50, 62, 66], "py": [0, 1, 14, 49, 61], "pydata": 2, "pypi": [1, 61], "python": [1, 32, 41, 56, 57, 61, 62, 68], "qtd": 62, "qualiti": 41, "quantifi": [31, 36, 62], "quantiti": [37, 39, 41, 47], "quartil": 50, "question": [16, 19, 20, 29, 32, 39, 49, 50, 64, 66, 68, 70], "question_num": 11, "question_word": 20, "quick": [1, 43], "quickli": 0, "quit": 40, "quot": [22, 48, 64], "quotat": [22, 48], "rabbit": 62, "rain": 41, "rais": [67, 71], "random": 55, "rang": [5, 8, 24, 30, 33, 34, 35, 40, 51, 53, 55, 56, 57], "ranganath": [16, 31, 32, 36, 38, 43, 54, 70], "ranganath2013": 70, "ranganathetal2013_detectingflirt": 16, "rapid": [1, 4], "rare": [34, 35], "rate": [42, 51], "rather": [1, 31, 34, 35, 36, 37, 45, 46, 63], "ratio": [16, 39, 64], "raw": [0, 12, 16, 21, 31, 33, 42, 50, 64], "re": [1, 31, 36, 42, 50, 61], "read": [0, 1, 2, 16, 21, 29, 33, 61, 62, 64, 65, 66, 67], "read_csv": 1, "read_in_lexicon": 67, "readabl": [11, 33, 64, 70], "reader": 33, "readi": 1, "readili": 62, "readthedoc": [1, 24, 61], "real": [1, 55], "realit": 13, "realli": [31, 36, 50], "reason": [31, 36, 45, 46, 49], "reassur": 49, "recal": 47, "recept": [18, 32, 39, 42, 50, 51, 52, 54, 56, 62, 64], "recogn": [1, 43, 47], "recognit": [0, 1, 2, 39, 64], "recommend": [0, 42, 62], "reddit": [48, 64], "reddit_tag": 11, "redditus": 48, "reduc": 63, "reduce_chunk": 63, "redund": [42, 62], "refer": [0, 1, 2, 11, 22, 24, 28, 31, 42, 48, 52, 61, 62, 64, 70], "reflect": [37, 43], "regardless": 1, "regener": [0, 2, 51, 67], "regenerate_vector": [0, 1, 2, 67], "regex": [14, 16, 49], "regist": 37, "regress": 1, "regular": [5, 14, 30, 32, 42, 55, 58], "reichel": [53, 58, 60], "reidl": [4, 13], "reinvent": 62, "rel": [41, 51, 52, 55, 60, 64], "relat": [1, 61, 62, 64], "relationship": 36, "relev": [1, 29, 42, 44, 49, 51, 56, 61, 64, 65], "reli": [31, 34, 35, 36, 69], "reliabl": [33, 42], "remain": [1, 30, 71], "rememb": 1, "remov": [0, 2, 9, 13, 19, 28, 40, 43, 48, 49, 50, 71], "remove_active_us": 9, "renam": 1, "repair": [16, 39], "repeat": [60, 71], "repetit": 60, "replac": 19, "report": [1, 61], "repres": [2, 4, 6, 7, 11, 13, 23, 31, 34, 36, 42, 45, 46, 66, 67, 68, 71, 72, 73], "represent": [34, 38, 67], "reproduc": [36, 62], "republican": 1, "request": [32, 50, 51], "requir": [0, 1, 20, 21, 31, 55, 61, 62, 64, 65, 66, 67], "research": [1, 2, 62], "reserv": 0, "resolv": 62, "resourc": [1, 39, 48, 61, 62], "respect": [1, 2, 12, 31, 36, 37, 69], "respons": [22, 48, 55, 58, 64], "restaur": [34, 56], "restor": 0, "restrict": 71, "result": [40, 55, 65, 72], "retain": [2, 16, 20, 21, 60, 71], 
"retriev": 50, "retunr": 3, "return": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 30, 32, 43, 49, 50, 51, 55, 62, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73], "reveal": 62, "revert": 50, "review": 62, "rewrit": 50, "rich": 62, "riedl": [13, 40, 55], "right": [31, 36, 61, 62], "roberta": [1, 39, 42, 52, 56, 61, 64, 67], "robust": 13, "rocklag": [5, 30, 64], "room": 59, "root": [13, 40], "rough": [12, 54], "roughli": 31, "round": [13, 40, 59, 71], "round_num": 1, "row": [0, 1, 2, 9, 13, 25, 37, 40, 59, 63, 68, 71, 72, 73], "rowbotham": 62, "rucker": 5, "rule": [1, 69], "run": [0, 10, 12, 16, 35, 46, 47, 48, 51, 61, 69], "runtim": [1, 35], "sagepub": [5, 64], "sai": [1, 32, 50, 59], "said": [1, 36, 62], "same": [0, 1, 2, 31, 34, 37, 45, 48, 52, 59, 60, 62, 71], "sampl": [61, 62], "sarcast": 48, "save": [0, 1, 2, 19, 64, 67], "save_featur": 2, "sbert": [1, 28, 31, 34, 35, 36, 45, 46, 64, 65, 67], "scale": [42, 51], "schema": 1, "scheme": 0, "school": [21, 62], "scienc": [29, 39, 62], "scientist": [61, 62], "score": [1, 4, 5, 11, 12, 13, 15, 21, 24, 28, 29, 30, 31, 34, 35, 36, 38, 39, 40, 45, 46, 47, 51, 53, 56, 57, 61, 64, 65, 67, 73], "script": [1, 61], "sea": 1, "seamless": 62, "search": [19, 61], "second": [0, 1, 4, 34, 42, 58, 59], "second_person": 49, "secr": [18, 49, 64], "section": [1, 29, 61], "see": [0, 1, 2, 30, 34, 38, 41, 45, 46, 47, 55, 62, 71], "seek": [5, 62], "segment": [0, 19], "select": [2, 4, 23, 28, 36, 45, 66, 67, 68, 71, 72, 73], "self": [1, 2, 61], "semant": [31, 34, 35, 41], "semantic_group": [1, 61], "send": [1, 37, 55], "sens": [5, 31, 54, 66], "sent": [1, 37, 64], "sentenc": [0, 1, 10, 15, 19, 20, 21, 33, 34, 35, 36, 42, 45, 46, 47, 48, 54, 56, 61, 67], "sentence_pad": 19, "sentence_split": 19, "sentence_to_train": 47, "sentencis": 19, "sentiment": [0, 1, 24, 31, 39, 42, 52, 56, 61, 62, 64, 67], "separ": [1, 2, 19, 34, 51], "sepcifi": 1, "septemb": 40, "sequenc": [1, 59], "sequenti": 1, "seri": [12, 16, 23, 28, 42, 71, 73], "serv": 12, "set": [0, 1, 2, 13, 23, 34, 48, 59], "set_self_conv_data": 2, "sever": [1, 30, 41, 42, 48, 51, 56, 61], "shall": 54, "share": [31, 36, 37], "she": [30, 31, 36], "shift": 34, "shop": 62, "short": [55, 58], "shorter": [13, 40, 41, 42, 43], "should": [0, 1, 2, 4, 14, 23, 28, 29, 31, 36, 47, 48, 54, 61, 62, 65, 66, 67, 68, 69, 71, 72, 73], "shouldn": [31, 36], "show": [1, 37, 61], "showeth": 62, "shruti": [35, 45, 46, 47, 62], "side": 31, "signal": [45, 55], "signifi": 42, "signific": [1, 61], "silent": 37, "similar": [1, 6, 7, 13, 28, 29, 31, 34, 35, 36, 40, 45, 46, 49, 62, 65], "similarli": [1, 35], "simpl": [0, 1, 16, 19, 42, 61, 62], "simpli": [1, 5, 11, 28, 56, 62], "simplifi": 1, "simplist": 41, "sinc": [1, 32, 41, 71], "singh": 62, "singl": [0, 1, 2, 11, 12, 19, 23, 31, 34, 35, 36, 37, 41, 45, 46, 59, 62, 71, 72], "singular": [12, 41, 64], "site": 16, "situat": 37, "size": [1, 13, 63, 67], "skip": 1, "slightli": [32, 62, 63], "slow": 1, "small": 40, "so": [1, 2, 10, 30, 31, 36, 37, 50, 61, 62, 66], "social": [29, 39, 61, 62], "socsci": 16, "softwar": 62, "sohi": 62, "sol3": 4, "solut": 59, "solv": 62, "some": [0, 1, 11, 17, 29, 32, 34, 35, 37, 41, 61, 63], "somebodi": [31, 36], "someon": [22, 29, 31, 36, 47, 48, 61, 64], "someplac": [31, 36], "someth": 47, "sometim": 1, "somewhat": 35, "soon": 62, "sorri": [16, 32, 50], "sort": 10, "sound": [47, 51], "sourc": [4, 5, 6, 12, 13, 16, 17, 21, 34, 35, 50, 64, 68], "space": [34, 40, 71], "spaci": [1, 19, 47, 49, 50, 61], "span": 63, "spars": 32, 
"speak": [1, 31, 36, 37, 59, 60, 62], "speaker": [0, 1, 2, 6, 8, 9, 25, 31, 34, 35, 37, 38, 42, 45, 46, 61, 66, 71, 72], "speaker_id": [2, 61, 72], "speaker_id_col": [0, 1, 2, 6, 8, 9, 25, 26, 27, 61, 65, 66, 71, 72], "speaker_nicknam": [0, 1, 2, 6, 9, 59, 66, 71], "special": [0, 1, 2, 48, 71], "specif": [1, 2, 12, 32, 41, 48, 55, 61, 62, 69, 71], "specifi": [1, 2, 19, 47, 49, 67, 68, 71, 72, 73], "speciifc": 63, "spend": [51, 62], "spike": 55, "split": [19, 21, 43, 63], "spoke": 59, "spoken": [11, 37], "spread": 55, "squar": [13, 40], "ssrn": 4, "stabl": 40, "stack": 14, "stackoverflow": 68, "stage": [1, 2, 34, 71], "stamp": 55, "standard": [1, 4, 37, 40, 41, 49, 55, 58, 60, 65, 72, 73], "stanford": 70, "start": [15, 19, 20, 22, 23, 50], "statement": [1, 38, 42, 47, 48, 61, 62, 64], "statist": [65, 66, 68], "statologi": 41, "stem": 42, "step": [1, 4, 28, 41, 45, 46, 51], "still": [41, 45, 46], "stochast": 40, "stop": [40, 62], "stopword": [13, 19], "store": [1, 12, 16, 41, 49, 51, 61, 65, 67], "stoword": 42, "str": [2, 3, 4, 5, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 63, 64, 65, 66, 67, 68, 71, 72, 73], "str_to_vec": 67, "str_vec": 67, "straightforward": 29, "strategi": [17, 30, 32, 38, 39, 42, 49, 54, 64], "stream": 35, "strictli": 1, "string": [0, 1, 2, 4, 8, 12, 13, 14, 19, 23, 24, 50, 66, 67, 68, 71, 72, 73], "strongli": [1, 41, 61], "structur": [0, 36, 49], "student": [21, 33], "studi": [1, 34, 62], "style": [1, 31, 36, 59], "sub": [0, 1, 71], "subfold": 1, "subject": [5, 24, 28, 39, 49, 64], "subjunct": 50, "sublist": 28, "submiss": 55, "subpart": [1, 71], "subsequ": [1, 30, 51, 58], "subset": 62, "substanc": 36, "substant": 31, "substanti": 1, "substr": 30, "subtask": 1, "subtract": [41, 58], "succe": 62, "success": [0, 1, 4, 31, 36, 43, 55, 58], "suggest": [1, 13, 34, 42, 44, 50], "suit": [62, 64], "sum": [1, 28, 34, 61, 64, 65, 66, 72], "summar": [0, 1, 65, 66, 69], "summari": [65, 66, 72], "summariz": [0, 65], "summarize_featur": 69, "suppl": 6, "support": [1, 15, 61], "suppos": 1, "sure": 30, "swear": 49, "syntax": [1, 32, 61], "system": [2, 59, 64], "t": [0, 1, 2, 15, 29, 31, 36, 45, 49, 54, 61, 62, 67], "tabl": 62, "tag": 39, "take": [1, 4, 5, 9, 14, 25, 29, 31, 34, 37, 39, 42, 55, 61, 65, 67, 71], "taken": [59, 71], "talk": [1, 37, 47, 59, 62], "tandem": [1, 61], "target": 15, "task": [1, 2, 59, 71], "tausczik": [12, 37, 41, 52], "tausczikpennebaker2013": 12, "team": [0, 1, 4, 11, 12, 13, 34, 39, 40, 59, 65], "team_bursti": 4, "team_comm_tool": [1, 61], "teamcommtool": 1, "technic": [29, 39, 61, 62], "teghxgbqdhgaaaaa": 5, "tempor": [0, 2, 55, 58, 64, 71], "temporal_featur": 11, "tend": [1, 34, 60], "term": [1, 28, 59], "termin": [1, 2, 61], "terribl": 51, "test": [13, 33, 47], "text": [0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 28, 32, 33, 36, 42, 48, 55, 62, 64, 67, 71], "text_based_featur": 64, "textblob": [24, 39, 51, 52, 64], "textblob_sentiment_analysi": 11, "than": [0, 1, 2, 11, 13, 31, 34, 35, 36, 37, 40, 41, 45, 46, 54, 60, 62, 63], "thee": 62, "thei": [0, 1, 28, 29, 31, 34, 36, 37, 39, 42, 47, 58, 59, 61, 62, 67], "them": [0, 1, 2, 19, 28, 29, 31, 36, 50, 51, 55, 59, 61, 62, 64, 65, 66, 67], "themselv": [31, 36, 60], "theoret": 35, "theori": [34, 50], "therefor": [0, 1, 11, 28, 37, 45, 59, 62, 69], "thi": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 16, 18, 20, 21, 23, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 
71, 72, 73], "thing": [48, 61], "think": [1, 38, 47], "thorough": [43, 62], "those": [1, 21, 31, 36, 61], "though": [34, 42], "thought": [1, 35, 45], "thread": [1, 61], "three": [0, 1, 2, 22, 34, 37, 40, 51, 61, 62, 69, 71], "threshold": [15, 47], "through": [1, 45, 46, 50, 61, 62], "throughout": [31, 35, 36, 40, 45, 46, 55, 63], "tht": 35, "thu": [1, 2, 34, 35, 36, 37, 46, 55, 71], "time": [0, 1, 4, 23, 34, 35, 39, 42, 48, 51, 55, 59, 61, 62, 63, 64, 65, 66, 71], "time_diff": 55, "timediff": 4, "timestamp": [0, 1, 2, 8, 23, 58, 61, 62, 63, 64, 71], "timestamp_col": [0, 1, 2, 8, 61, 63, 64, 65, 71], "timestamp_end": [1, 23, 61, 64], "timestamp_start": [1, 23, 61, 64], "timestamp_unit": [0, 2, 23, 64], "to_datetim": [0, 2], "todai": [34, 35, 41, 43, 45, 46, 47], "todo": 66, "togeth": [0, 62, 66], "token": [16, 19, 39, 49, 54, 64], "token_count": [19, 49], "too": [30, 31, 36, 62], "took": [1, 59], "tool": [1, 61, 62], "toolkit": [0, 1, 11, 42, 45, 46, 55, 62], "top": [1, 50, 59], "topic": [1, 13, 31, 34, 40, 42, 43, 65], "tormala": 5, "total": [1, 3, 12, 16, 25, 31, 34, 36, 37, 41, 44, 53, 59, 60, 61, 62, 63, 64, 66, 72], "touch": [1, 61], "toward": [31, 36, 38, 42, 45, 46], "tradit": 49, "train": [1, 2, 15, 64], "train_spacy_n": 15, "transcript": 0, "transfom": [45, 46], "transform": [31, 34, 35, 36, 51], "transform_utter": 50, "treat": [0, 1, 59, 61], "tri": [50, 64], "trivial": [3, 44, 62], "troubl": [1, 61], "true": [0, 1, 2, 37, 61, 63, 67, 71], "truncat": 2, "truth_intensifi": 49, "ttr": 64, "tupl": [0, 1, 2, 15, 19, 64], "turn": [0, 2, 25, 28, 31, 32, 37, 39, 61, 64, 65, 71], "turn_count": 59, "turn_df": 71, "turn_id": 71, "turn_taking_featur": 11, "twice": 63, "twitter": [1, 51, 61], "two": [0, 1, 2, 23, 31, 34, 36, 41, 45, 46, 52, 62, 63], "txt": 19, "type": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 37, 39, 52, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73], "typic": [1, 34, 40, 41, 42, 52, 60], "u": [0, 1, 2, 22, 31, 36, 48, 49, 58], "uci": 16, "uh": [31, 36], "ulrich": 55, "um": [31, 36, 60], "umbrella": [8, 29, 34], "uncertain": [5, 30], "uncertainti": 30, "under": [0, 1, 10, 11, 12, 28, 40], "underli": [1, 61], "underscor": [1, 61], "understand": [0, 33, 39, 43, 48, 58, 61, 62], "understood": 33, "uninterrupt": 59, "uniqu": [0, 1, 2, 6, 9, 13, 16, 23, 25, 41, 47, 52, 60, 61, 63, 71], "unit": [0, 2, 23], "univers": 62, "unix": 58, "unless": [31, 36], "unpack": 62, "unpreprocess": [0, 2], "until": [31, 36, 45, 46], "unzip": [1, 61], "up": [1, 17, 21, 28, 31, 35, 36, 37, 45, 46, 51, 59, 61], "updat": [1, 9, 40, 54, 61], "upenn": 1, "upload": 13, "upon": 33, "upper": 42, "us": [0, 1, 2, 3, 5, 11, 12, 13, 17, 19, 24, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 56, 57, 60, 62, 64, 65, 66, 67, 71], "usag": [21, 24], "use_time_if_poss": 63, "user": [0, 1, 2, 9, 15, 22, 37, 47, 48, 51, 61, 62, 63, 64, 65, 66, 69, 72], "user_data": [2, 65, 66], "user_df": 9, "user_level_featur": 2, "user_list": 9, "userlevelfeaturescalcul": [2, 66, 69], "usernam": [22, 48], "utf": 1, "util": [1, 12, 21, 61, 62, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73], "utilti": 62, "utter": [0, 1, 2, 3, 4, 5, 13, 14, 15, 16, 17, 20, 21, 23, 24, 30, 31, 32, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 50, 51, 52, 54, 58, 60, 61, 67], "utteranc": 42, "v": [0, 1, 2, 13, 42, 61], "v0": 0, "valenc": 51, "valid": [23, 55], "valu": [0, 1, 2, 5, 6, 10, 12, 13, 18, 19, 23, 28, 30, 31, 34, 36, 37, 40, 41, 42, 45, 46, 47, 55, 59, 61, 
64, 67, 68, 71, 72, 73], "vari": [13, 31, 34, 35], "variabl": [1, 56, 57, 64, 65, 66], "varianc": [8, 34], "variance_in_dd": 11, "variat": [4, 32], "varieti": [42, 62], "variou": [19, 42, 64, 65, 66], "vast": 62, "ve": [0, 31, 36, 50, 61], "vec": 6, "vect_data": [1, 7, 8, 28, 61, 64, 65, 66], "vect_path": 67, "vector": [0, 2, 6, 7, 8, 13, 28, 34, 35, 40, 55, 61, 64, 65, 67], "vector_data": [0, 1, 2, 61], "vector_directori": [0, 1, 2, 61, 65], "vein": 45, "verb": [19, 31, 36], "verbal": 32, "veri": [5, 28, 30, 31, 34, 35, 36, 42, 49, 54], "verifi": 2, "verit": 62, "version": [1, 12, 14, 21, 28, 31, 40, 51, 61], "versu": [4, 29, 47, 55, 59], "via": [3, 44], "view": 50, "visit": 41, "voila": 62, "w": 31, "wa": [0, 1, 2, 5, 12, 31, 32, 35, 36, 47, 51, 56, 59, 62, 71], "wai": [1, 2, 29, 30, 31, 32, 34, 49, 50, 54, 56, 57, 61, 62, 66], "waiai": 62, "wait": [4, 55], "walk": 1, "walkthrough": [0, 61, 62], "want": [1, 28, 34, 59, 61, 62, 67], "warn": 50, "watt": [1, 2, 62, 71], "we": [0, 1, 2, 3, 4, 5, 9, 10, 11, 12, 15, 16, 18, 23, 24, 28, 29, 30, 31, 34, 35, 36, 37, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59, 61, 62, 66, 67, 71], "web": 70, "websit": [1, 61], "week": 47, "weight": 66, "weigt": 31, "welcom": 61, "well": [29, 31, 36, 55, 62], "went": 41, "were": [1, 12, 31, 36, 42], "western": 1, "wh": [19, 31, 36], "wh_question": [32, 49, 54], "wharton": 62, "what": [1, 2, 12, 16, 20, 29, 31, 32, 34, 35, 36, 39, 41, 45, 46, 47, 50, 54, 62, 63], "whatev": [1, 31, 36], "wheel": 62, "when": [1, 16, 20, 31, 33, 36, 47, 54, 55, 59, 60, 61, 62, 69, 71], "whenev": 71, "where": [1, 2, 19, 20, 28, 31, 32, 36, 37, 40, 41, 42, 48, 50, 51, 54, 59, 61, 65, 68, 73], "wherea": [31, 34, 35, 36, 43], "wherev": [31, 36], "whether": [1, 2, 10, 16, 19, 32, 37, 38, 41, 43, 47, 57, 58, 62, 63, 64, 67, 71], "which": [0, 1, 2, 3, 4, 5, 7, 9, 12, 13, 15, 16, 18, 23, 25, 28, 31, 34, 35, 36, 37, 38, 40, 41, 42, 51, 53, 54, 55, 56, 57, 58, 59, 61, 62, 64, 66, 68, 69, 71, 72, 73], "while": [31, 32, 34, 36, 37, 44, 45, 46, 55, 62, 71], "whitespac": 43, "who": [20, 31, 32, 36, 47, 51, 54, 59, 60, 62], "whole": [28, 59, 62, 71], "whom": [31, 36, 54], "whose": [31, 36, 54], "why": [20, 29, 31, 36, 54], "wide": 31, "wien": 62, "wiki": [21, 29, 70], "wiki_link": [1, 61], "wikipedia": [21, 33, 37, 70], "williamson": 60, "wish": [1, 2, 18, 28], "within": [0, 1, 2, 8, 11, 16, 28, 30, 31, 34, 35, 36, 41, 45, 46, 52, 55, 59, 60, 62, 63, 64, 68, 71, 73], "within_group": 2, "within_person_discursive_rang": 11, "within_task": [0, 1, 2, 71], "without": [1, 19, 31, 36, 42, 47, 54, 62, 69], "won": [0, 31, 36, 45], "wonder": 56, "woolei": 4, "woollei": [13, 40, 55], "wooten": 55, "word": [3, 10, 11, 12, 13, 14, 16, 19, 20, 21, 22, 28, 30, 32, 33, 37, 38, 39, 40, 41, 43, 45, 46, 48, 49, 52, 53, 54, 56, 57, 62, 64, 65, 66, 69, 70], "word_mimicri": 11, "word_start": [19, 49], "wordnet": [1, 61], "words_in_lin": 19, "work": [0, 47, 50, 55, 61, 62], "world": 55, "worri": 62, "would": [1, 29, 31, 34, 35, 36, 37, 42, 50, 54, 62], "wouldn": [31, 36], "wow": 50, "wp": 13, "write": [2, 29, 60], "www": [12, 13, 18, 41, 64], "x": [0, 1, 2, 4, 46, 68], "xinlan": 62, "yashveer": 62, "ye": 19, "yeah": [31, 36], "yeoman": [18, 49], "yesno_quest": [32, 49, 54], "yet": 48, "ylatau": 12, "you": [0, 1, 2, 11, 24, 29, 31, 36, 37, 43, 47, 50, 59, 61, 62, 69], "your": [0, 29, 31, 32, 36, 37, 50, 59, 61, 62], "yourself": [31, 36, 50], "yuluan": 62, "yup": [31, 36], "yuxuan": 62, "z": [12, 39, 49, 51, 64, 73], "zero": [13, 52], "zhang": 62, 
"zheng": 62, "zhong": 62, "zhou": 62, "zscore": 41, "zscore_chat": 41, "zscore_chats_and_convers": 69, "zscore_convers": 41, "\u00bc": 47, "\u03c4": 55}, "titles": ["The Basics (Get Started Here!)", "Worked Example", "feature_builder module", "basic_features module", "burstiness module", "certainty module", "discursive_diversity module", "fflow module", "get_all_DD_features module", "get_user_network module", "hedge module", "Features: Technical Documentation", "info_exchange_zscore module", "information_diversity module", "lexical_features_v2 module", "named_entity_recognition_features module", "other_lexical_features module", "politeness_features module", "politeness_v2 module", "politeness_v2_helper module", "question_num module", "readability module", "reddit_tags module", "temporal_features module", "textblob_sentiment_analysis module", "turn_taking_features module", "variance_in_DD module", "within_person_discursive_range module", "word_mimicry module", "FEATURE NAME", "Certainty", "Content Word Accommodation", "Conversational Repair", "Dale-Chall Score", "Discursive Diversity", "Forward Flow", "Function Word Accommodation", "Gini Coefficient", "Hedge", "Features: Conceptual Documentation", "Information Diversity", "Information Exchange", "Linguistic Inquiry and Word Count (LIWC) and Other Lexicons", "Message Length", "Message Quantity", "Mimicry (BERT)", "Moving Mimicry", "Named Entity Recognition", "Online Discussion Tags", "Politeness/Receptiveness Markers", "Politeness Strategies", "Sentiment (RoBERTa)", "Positivity Z-Score", "Proportion of First Person Pronouns", "Question (Naive)", "Team Burstiness", "Textblob Polarity", "Textblob Subjectivity", "Time Difference", "Turn Taking Index", "Word Type-Token Ratio", "The Team Communication Toolkit", "Introduction", "assign_chunk_nums module", "calculate_chat_level_features module", "calculate_conversation_level_features module", "calculate_user_level_features module", "check_embeddings module", "gini_coefficient module", "Utilities", "preload_word_lists module", "preprocess module", "summarize_features module", "zscore_chats_and_conversation module"], "titleterms": {"A": 0, "One": 0, "The": [0, 61, 62], "accommod": [31, 36], "addit": 1, "advanc": 1, "assign_chunk_num": 63, "assumpt": 0, "basic": [0, 1, 29, 30, 31, 33, 34, 35, 36, 37, 38, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 55, 56, 57, 58, 59, 60], "basic_featur": 3, "bert": 45, "bursti": [4, 55], "calculate_chat_level_featur": 64, "calculate_conversation_level_featur": 65, "calculate_user_level_featur": 66, "caveat": [29, 30, 31, 33, 34, 35, 36, 38, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 55, 56, 57, 58, 59], "certainti": [5, 30], "chall": 33, "chat": [11, 39], "check_embed": 67, "citat": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "class": 69, "code": [0, 1], "coeffici": 37, "column": [1, 61], "commun": 61, "conceptu": 39, "configur": 1, "consider": 1, "content": [31, 61], "convers": [1, 11, 32, 39, 62, 69], "count": [42, 59], "customiz": 0, "dale": 33, "data": 1, "declar": 61, "demo": [0, 1], "detail": 1, "differ": 58, "directori": 1, "discurs": 34, "discursive_divers": 6, "discuss": 48, "divers": [34, 40], "document": [11, 39, 62], "driver": 69, "entiti": 47, "environ": [1, 61], "exampl": [1, 41, 47], "exchang": 41, "featur": [1, 11, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 69], 
"feature_build": 2, "featurebuild": [1, 61, 62], "fflow": 7, "file": [1, 30, 34, 35, 45, 46, 47, 51], "first": 53, "flow": 35, "forward": 35, "function": [0, 36], "gener": [1, 61, 62], "get": [0, 1, 61, 62], "get_all_dd_featur": 8, "get_user_network": 9, "gini": 37, "gini_coeffici": 68, "hedg": [10, 38], "here": 0, "high": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "implement": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "import": [1, 61], "index": 59, "indic": 61, "info_exchange_zscor": 12, "inform": [1, 40, 41, 61], "information_divers": 13, "input": [1, 34], "inquiri": 42, "inspect": [1, 61], "interpret": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "introduct": 62, "intuit": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "kei": 0, "length": 43, "level": [11, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 69], "lexical_features_v2": 14, "lexicon": 42, "light": 0, "linguist": 42, "liwc": 42, "marker": 49, "messag": [43, 44], "mimicri": [45, 46], "modul": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73], "motiv": 62, "move": 46, "naiv": 54, "name": [1, 29, 47, 61], "named_entity_recognition_featur": 15, "note": [29, 30, 31, 33, 34, 35, 36, 38, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 55, 56, 57, 58, 59], "onlin": 48, "other": [42, 69], "other_lexical_featur": 16, "ouput": 34, "our": 62, "output": [1, 30, 35, 45, 46, 47, 51], "packag": [0, 1, 61], "paramet": [0, 1], "person": 53, "pip": [1, 61], "polar": 56, "polit": [49, 50], "politeness_featur": 17, "politeness_v2": 18, "politeness_v2_help": 19, "posit": 52, "preload_word_list": 70, "preprocess": 71, "pronoun": 53, "proport": 53, "quantiti": 44, "question": 54, "question_num": 20, "ratio": 60, "readabl": 21, "recept": 49, "recognit": 47, "recommend": [1, 61], "reddit_tag": 22, "relat": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "repair": 32, "roberta": 51, "run": 1, "sampl": [0, 1], "score": [33, 41, 52], "sentiment": 51, "speaker": [11, 59, 62, 69], "start": [0, 1, 61, 62], "strategi": 50, "subject": 57, "summarize_featur": 72, "tabl": 61, "tag": 48, "take": 59, "team": [55, 61, 62], "technic": 11, "temporal_featur": 23, "textblob": [56, 57], "textblob_sentiment_analysi": 24, "time": 58, "token": 60, "toolkit": 61, "touch": 0, "train": 47, "troubleshoot": [1, 61], "turn": [1, 59], "turn_taking_featur": 25, "type": 60, "us": 61, "user": 11, "util": 69, "utter": [11, 39, 62, 69], "variance_in_dd": 26, "vector": 1, "virtual": [1, 61], "walkthrough": 1, "within_person_discursive_rang": 27, "word": [31, 36, 42, 60], "word_mimicri": 28, "work": 1, "your": 1, "z": [41, 52], "zscore_chats_and_convers": 73}}) \ No newline at end of file +Search.setIndex({"alltitles": {"A Light-Touch, One-Function Package": [[0, "a-light-touch-one-function-package"]], "Additional FeatureBuilder Considerations": [[1, "additional-featurebuilder-considerations"]], "Advanced Configuration Columns": [[1, "advanced-configuration-columns"]], "Aggregation Overview": [[1, "id2"]], "Analyzing 
First Percentage (%)": [[1, "analyzing-first-percentage"]], "Base Conversation-Level Features": [[11, "base-conversation-level-features"]], "Basic Input Columns": [[1, "basic-input-columns"]], "Certainty": [[30, null]], "Citation": [[29, "citation"], [30, "citation"], [31, "citation"], [32, "citation"], [33, "citation"], [34, "citation"], [35, "citation"], [36, "citation"], [37, "citation"], [38, "citation"], [40, "citation"], [41, "citation"], [42, "citation"], [43, "citation"], [44, "citation"], [45, "citation"], [46, "citation"], [47, "citation"], [48, "citation"], [49, "citation"], [50, "citation"], [51, "citation"], [52, "citation"], [53, "citation"], [54, "citation"], [55, "citation"], [56, "citation"], [57, "citation"], [58, "citation"], [59, "citation"], [60, "citation"]], "Configuring the FeatureBuilder": [[1, "configuring-the-featurebuilder"]], "Content Word Accommodation": [[31, null]], "Contents:": [[61, null]], "Conversation Parameters": [[1, "conversation-parameters"]], "Conversation-Level Aggregates": [[11, "conversation-level-aggregates"]], "Conversation-Level Features": [[11, "conversation-level-features"], [39, "conversation-level-features"]], "Conversational Repair": [[32, null]], "Cumulative Grouping": [[1, "cumulative-grouping"]], "Custom Aggregation": [[1, "custom-aggregation"]], "Custom Features": [[1, "custom-features"]], "Customizable Parameters": [[0, "customizable-parameters"]], "Dale-Chall Score": [[33, null]], "Declaring a FeatureBuilder": [[61, "declaring-a-featurebuilder"]], "Demo / Sample Code": [[0, "demo-sample-code"], [1, "demo-sample-code"]], "Discursive Diversity": [[34, null]], "Example Usage of Custom Aggregation Parameters": [[1, "example-usage-of-custom-aggregation-parameters"]], "Example:": [[41, "example"]], "FEATURE NAME": [[29, null]], "Feature Column Names": [[1, "feature-column-names"], [61, "feature-column-names"]], "Feature Documentation": [[62, "feature-documentation"]], "Feature Information": [[1, "feature-information"], [61, "feature-information"]], "Features: Conceptual Documentation": [[39, null]], "Features: Technical Documentation": [[11, null]], "Forward Flow": [[35, null]], "Function Word Accommodation": [[36, null]], "Generating Features: Utterance-, Speaker-, and Conversation-Level": [[62, "generating-features-utterance-speaker-and-conversation-level"]], "Getting Started": [[1, "getting-started"], [61, "getting-started"], [62, "getting-started"]], "Gini Coefficient": [[37, null]], "Hedge": [[38, null]], "High*Level Intuition": [[54, "high-level-intuition"]], "High-Level Intuition": [[29, "high-level-intuition"], [30, "high-level-intuition"], [31, "high-level-intuition"], [32, "high-level-intuition"], [33, "high-level-intuition"], [34, "high-level-intuition"], [35, "high-level-intuition"], [36, "high-level-intuition"], [37, "high-level-intuition"], [38, "high-level-intuition"], [40, "high-level-intuition"], [41, "high-level-intuition"], [42, "high-level-intuition"], [43, "high-level-intuition"], [44, "high-level-intuition"], [45, "high-level-intuition"], [46, "high-level-intuition"], [47, "high-level-intuition"], [48, "high-level-intuition"], [49, "high-level-intuition"], [50, "high-level-intuition"], [51, "high-level-intuition"], [52, "high-level-intuition"], [53, "high-level-intuition"], [55, "high-level-intuition"], [56, "high-level-intuition"], [57, "high-level-intuition"], [58, "high-level-intuition"], [59, "high-level-intuition"], [60, "high-level-intuition"]], "Implementation": [[32, "implementation"], [42, 
"implementation"], [52, "implementation"], [54, "implementation"]], "Implementation Basics": [[29, "implementation-basics"], [30, "implementation-basics"], [31, "implementation-basics"], [33, "implementation-basics"], [34, "implementation-basics"], [35, "implementation-basics"], [36, "implementation-basics"], [37, "implementation-basics"], [38, "implementation-basics"], [40, "implementation-basics"], [41, "implementation-basics"], [43, "implementation-basics"], [44, "implementation-basics"], [45, "implementation-basics"], [46, "implementation-basics"], [47, "implementation-basics"], [48, "implementation-basics"], [49, "implementation-basics"], [50, "implementation-basics"], [51, "implementation-basics"], [53, "implementation-basics"], [55, "implementation-basics"], [56, "implementation-basics"], [57, "implementation-basics"], [58, "implementation-basics"], [59, "implementation-basics"], [60, "implementation-basics"]], "Implementation Notes/Caveats": [[29, "implementation-notes-caveats"], [30, "implementation-notes-caveats"], [31, "implementation-notes-caveats"], [33, "implementation-notes-caveats"], [34, "implementation-notes-caveats"], [35, "implementation-notes-caveats"], [36, "implementation-notes-caveats"], [38, "implementation-notes-caveats"], [40, "implementation-notes-caveats"], [41, "implementation-notes-caveats"], [43, "implementation-notes-caveats"], [44, "implementation-notes-caveats"], [45, "implementation-notes-caveats"], [46, "implementation-notes-caveats"], [47, "implementation-notes-caveats"], [48, "implementation-notes-caveats"], [49, "implementation-notes-caveats"], [50, "implementation-notes-caveats"], [51, "implementation-notes-caveats"], [53, "implementation-notes-caveats"], [55, "implementation-notes-caveats"], [56, "implementation-notes-caveats"], [57, "implementation-notes-caveats"], [58, "implementation-notes-caveats"], [59, "implementation-notes-caveats"]], "Import Recommendations: Virtual Environment and Pip": [[1, "import-recommendations-virtual-environment-and-pip"], [61, "import-recommendations-virtual-environment-and-pip"]], "Important Notes and Caveats": [[1, "important-notes-and-caveats"]], "Importing the Package": [[1, "importing-the-package"]], "Indices and Tables": [[61, "indices-and-tables"]], "Information Diversity": [[40, null]], "Information Exchange": [[41, null]], "Input File": [[34, "id2"]], "Inspecting Generated Features": [[1, "inspecting-generated-features"], [61, "inspecting-generated-features"]], "Interpretation:": [[41, "interpretation"]], "Interpreting the Feature": [[29, "interpreting-the-feature"], [30, "interpreting-the-feature"], [31, "interpreting-the-feature"], [32, "interpreting-the-feature"], [33, "interpreting-the-feature"], [34, "interpreting-the-feature"], [35, "interpreting-the-feature"], [36, "interpreting-the-feature"], [37, "interpreting-the-feature"], [38, "interpreting-the-feature"], [40, "interpreting-the-feature"], [41, "interpreting-the-feature"], [42, "interpreting-the-feature"], [43, "interpreting-the-feature"], [44, "interpreting-the-feature"], [45, "interpreting-the-feature"], [46, "interpreting-the-feature"], [47, "interpreting-the-feature"], [48, "interpreting-the-feature"], [49, "interpreting-the-feature"], [50, "interpreting-the-feature"], [51, "interpreting-the-feature"], [52, "interpreting-the-feature"], [53, "interpreting-the-feature"], [54, "interpreting-the-feature"], [55, "interpreting-the-feature"], [56, "interpreting-the-feature"], [57, "interpreting-the-feature"], [58, "interpreting-the-feature"], [59, 
"interpreting-the-feature"], [60, "interpreting-the-feature"]], "Introduction": [[62, null]], "Key Assumptions and Parameters": [[0, "key-assumptions-and-parameters"]], "Linguistic Inquiry and Word Count (LIWC) and Other Lexicons": [[42, null]], "Message Length": [[43, null]], "Message Quantity": [[44, null]], "Mimicry (BERT)": [[45, null]], "Motivation": [[62, "motivation"]], "Moving Mimicry": [[46, null]], "Named Entity Recognition": [[1, "named-entity-recognition"], [47, null]], "Named Entity Training Examples": [[47, "id2"]], "Online Discussion Tags": [[48, null]], "Other Utilities": [[69, "other-utilities"]], "Ouput File": [[34, "id3"]], "Our Team": [[62, "our-team"]], "Output File": [[30, "id2"], [35, "id2"], [45, "id2"], [46, "id2"], [47, "id3"], [51, "id1"]], "Output File Naming Details": [[1, "output-file-naming-details"]], "Package Assumptions": [[0, "package-assumptions"]], "Politeness Strategies": [[50, null]], "Politeness/Receptiveness Markers": [[49, null]], "Positivity Z-Score": [[52, null]], "Proportion of First Person Pronouns": [[53, null]], "Question (Naive)": [[54, null]], "Regenerating Vector Cache": [[1, "regenerating-vector-cache"]], "Related Features": [[29, "related-features"], [30, "related-features"], [31, "related-features"], [32, "related-features"], [33, "related-features"], [34, "related-features"], [35, "related-features"], [36, "related-features"], [37, "related-features"], [38, "related-features"], [40, "related-features"], [41, "related-features"], [42, "related-features"], [43, "related-features"], [44, "related-features"], [45, "related-features"], [46, "related-features"], [47, "related-features"], [48, "related-features"], [49, "related-features"], [50, "related-features"], [51, "related-features"], [52, "related-features"], [53, "related-features"], [54, "related-features"], [55, "related-features"], [56, "related-features"], [57, "related-features"], [58, "related-features"], [59, "related-features"], [60, "related-features"]], "Sentiment (RoBERTa)": [[51, null]], "Speaker Turn Counts": [[59, "id2"]], "Speaker- (User) Level Features": [[11, "speaker-user-level-features"]], "Table of Contents": [[61, "table-of-contents"]], "Team Burstiness": [[55, null]], "Textblob Polarity": [[56, null]], "Textblob Subjectivity": [[57, null]], "The Basics (Get Started Here!)": [[0, null]], "The FeatureBuilder": [[62, "the-featurebuilder"]], "The Team Communication Toolkit": [[61, null]], "Time Difference": [[58, null]], "Troubleshooting": [[1, "troubleshooting"], [61, "troubleshooting"]], "Turn Taking Index": [[59, null]], "Turns": [[1, "turns"]], "Using the Package": [[61, "using-the-package"]], "Utilities": [[69, null]], "Utterance- (Chat) Level Features": [[11, "utterance-chat-level-features"], [39, "utterance-chat-level-features"]], "Vector Directory": [[1, "vector-directory"]], "Walkthrough: Running the FeatureBuilder on Your Data": [[1, "walkthrough-running-the-featurebuilder-on-your-data"]], "Word Type-Token Ratio": [[60, null]], "Worked Example": [[1, null]], "assign_chunk_nums module": [[63, null]], "basic_features module": [[3, null]], "burstiness module": [[4, null]], "calculate_chat_level_features module": [[64, null]], "calculate_conversation_level_features module": [[65, null]], "calculate_user_level_features module": [[66, null]], "certainty module": [[5, null]], "check_embeddings module": [[67, null]], "discursive_diversity module": [[6, null]], "feature_builder module": [[2, null]], "fflow module": [[7, null]], "get_all_DD_features module": [[8, 
null]], "get_user_network module": [[9, null]], "gini_coefficient module": [[68, null]], "hedge module": [[10, null]], "info_exchange_zscore module": [[12, null]], "information_diversity module": [[13, null]], "lexical_features_v2 module": [[14, null]], "named_entity_recognition_features module": [[15, null]], "other_lexical_features module": [[16, null]], "politeness_features module": [[17, null]], "politeness_v2 module": [[18, null]], "politeness_v2_helper module": [[19, null]], "preload_word_lists module": [[70, null]], "preprocess module": [[71, null]], "question_num module": [[20, null]], "readability module": [[21, null]], "reddit_tags module": [[22, null]], "summarize_features module": [[72, null]], "temporal_features module": [[23, null]], "textblob_sentiment_analysis module": [[24, null]], "turn_taking_features module": [[25, null]], "variance_in_DD module": [[26, null]], "within_person_discursive_range module": [[27, null]], "word_mimicry module": [[28, null]], "z-scores:": [[41, "z-scores"]], "zscore_chats_and_conversation module": [[73, null]], "\u201cDriver\u201d Classes: Utterance-, Conversation-, and Speaker-Level Features": [[69, "driver-classes-utterance-conversation-and-speaker-level-features"]]}, "docnames": ["basics", "examples", "feature_builder", "features/basic_features", "features/burstiness", "features/certainty", "features/discursive_diversity", "features/fflow", "features/get_all_DD_features", "features/get_user_network", "features/hedge", "features/index", "features/info_exchange_zscore", "features/information_diversity", "features/lexical_features_v2", "features/named_entity_recognition_features", "features/other_lexical_features", "features/politeness_features", "features/politeness_v2", "features/politeness_v2_helper", "features/question_num", "features/readability", "features/reddit_tags", "features/temporal_features", "features/textblob_sentiment_analysis", "features/turn_taking_features", "features/variance_in_DD", "features/within_person_discursive_range", "features/word_mimicry", "features_conceptual/TEMPLATE", "features_conceptual/certainty", "features_conceptual/content_word_accommodation", "features_conceptual/conversational_repair", "features_conceptual/dale_chall_score", "features_conceptual/discursive_diversity", "features_conceptual/forward_flow", "features_conceptual/function_word_accommodation", "features_conceptual/gini_coefficient", "features_conceptual/hedge", "features_conceptual/index", "features_conceptual/information_diversity", "features_conceptual/information_exchange", "features_conceptual/liwc", "features_conceptual/message_length", "features_conceptual/message_quantity", "features_conceptual/mimicry_bert", "features_conceptual/moving_mimicry", "features_conceptual/named_entity_recognition", "features_conceptual/online_discussions_tags", "features_conceptual/politeness_receptiveness_markers", "features_conceptual/politeness_strategies", "features_conceptual/positivity_bert", "features_conceptual/positivity_z_score", "features_conceptual/proportion_of_first_person_pronouns", "features_conceptual/questions", "features_conceptual/team_burstiness", "features_conceptual/textblob_polarity", "features_conceptual/textblob_subjectivity", "features_conceptual/time_difference", "features_conceptual/turn_taking_index", "features_conceptual/word_ttr", "index", "intro", "utils/assign_chunk_nums", "utils/calculate_chat_level_features", "utils/calculate_conversation_level_features", "utils/calculate_user_level_features", "utils/check_embeddings", 
"utils/gini_coefficient", "utils/index", "utils/preload_word_lists", "utils/preprocess", "utils/summarize_features", "utils/zscore_chats_and_conversation"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["basics.rst", "examples.rst", "feature_builder.rst", "features/basic_features.rst", "features/burstiness.rst", "features/certainty.rst", "features/discursive_diversity.rst", "features/fflow.rst", "features/get_all_DD_features.rst", "features/get_user_network.rst", "features/hedge.rst", "features/index.rst", "features/info_exchange_zscore.rst", "features/information_diversity.rst", "features/lexical_features_v2.rst", "features/named_entity_recognition_features.rst", "features/other_lexical_features.rst", "features/politeness_features.rst", "features/politeness_v2.rst", "features/politeness_v2_helper.rst", "features/question_num.rst", "features/readability.rst", "features/reddit_tags.rst", "features/temporal_features.rst", "features/textblob_sentiment_analysis.rst", "features/turn_taking_features.rst", "features/variance_in_DD.rst", "features/within_person_discursive_range.rst", "features/word_mimicry.rst", "features_conceptual/TEMPLATE.rst", "features_conceptual/certainty.rst", "features_conceptual/content_word_accommodation.rst", "features_conceptual/conversational_repair.rst", "features_conceptual/dale_chall_score.rst", "features_conceptual/discursive_diversity.rst", "features_conceptual/forward_flow.rst", "features_conceptual/function_word_accommodation.rst", "features_conceptual/gini_coefficient.rst", "features_conceptual/hedge.rst", "features_conceptual/index.rst", "features_conceptual/information_diversity.rst", "features_conceptual/information_exchange.rst", "features_conceptual/liwc.rst", "features_conceptual/message_length.rst", "features_conceptual/message_quantity.rst", "features_conceptual/mimicry_bert.rst", "features_conceptual/moving_mimicry.rst", "features_conceptual/named_entity_recognition.rst", "features_conceptual/online_discussions_tags.rst", "features_conceptual/politeness_receptiveness_markers.rst", "features_conceptual/politeness_strategies.rst", "features_conceptual/positivity_bert.rst", "features_conceptual/positivity_z_score.rst", "features_conceptual/proportion_of_first_person_pronouns.rst", "features_conceptual/questions.rst", "features_conceptual/team_burstiness.rst", "features_conceptual/textblob_polarity.rst", "features_conceptual/textblob_subjectivity.rst", "features_conceptual/time_difference.rst", "features_conceptual/turn_taking_index.rst", "features_conceptual/word_ttr.rst", "index.rst", "intro.rst", "utils/assign_chunk_nums.rst", "utils/calculate_chat_level_features.rst", "utils/calculate_conversation_level_features.rst", "utils/calculate_user_level_features.rst", "utils/check_embeddings.rst", "utils/gini_coefficient.rst", "utils/index.rst", "utils/preload_word_lists.rst", "utils/preprocess.rst", "utils/summarize_features.rst", "utils/zscore_chats_and_conversation.rst"], "indexentries": {"adverb_limiter() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.adverb_limiter", false]], "assert_key_columns_present() (in module utils.preprocess)": [[71, "utils.preprocess.assert_key_columns_present", false]], "assign_chunk_nums() (in module utils.assign_chunk_nums)": 
[[63, "utils.assign_chunk_nums.assign_chunk_nums", false]], "bare_command() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.bare_command", false]], "built_spacy_ner() (in module features.named_entity_recognition_features)": [[15, "features.named_entity_recognition_features.built_spacy_ner", false]], "burstiness() (in module features.burstiness)": [[4, "features.burstiness.burstiness", false]], "calculate_chat_level_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_chat_level_features", false]], "calculate_conversation_level_features() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.calculate_conversation_level_features", false]], "calculate_hedge_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_hedge_features", false]], "calculate_id_score() (in module features.information_diversity)": [[13, "features.information_diversity.calculate_ID_score", false]], "calculate_info_diversity() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.calculate_info_diversity", false]], "calculate_named_entities() (in module features.named_entity_recognition_features)": [[15, "features.named_entity_recognition_features.calculate_named_entities", false]], "calculate_num_question_naive() (in module features.question_num)": [[20, "features.question_num.calculate_num_question_naive", false]], "calculate_politeness_sentiment() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_politeness_sentiment", false]], "calculate_politeness_v2() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_politeness_v2", false]], "calculate_team_burstiness() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.calculate_team_burstiness", false]], "calculate_textblob_sentiment() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_textblob_sentiment", false]], "calculate_user_level_features() (utils.calculate_user_level_features.userlevelfeaturescalculator method)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator.calculate_user_level_features", false]], "calculate_vector_word_mimicry() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_vector_word_mimicry", false]], "calculate_word_mimicry() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_word_mimicry", false]], "chat_level_features() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.chat_level_features", false]], "chatlevelfeaturescalculator (class in 
utils.calculate_chat_level_features)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator", false]], "check_embeddings() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.check_embeddings", false]], "classify_ntri() (in module features.other_lexical_features)": [[16, "features.other_lexical_features.classify_NTRI", false]], "classify_text_dalechall() (in module features.readability)": [[21, "features.readability.classify_text_dalechall", false]], "clean_text() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.clean_text", false]], "coerce_to_date_or_number() (in module features.temporal_features)": [[23, "features.temporal_features.coerce_to_date_or_number", false]], "commit_data() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.commit_data", false]], "compress() (in module utils.preprocess)": [[71, "utils.preprocess.compress", false]], "compute_frequency() (in module features.word_mimicry)": [[28, "features.word_mimicry.compute_frequency", false]], "compute_frequency_per_conv() (in module features.word_mimicry)": [[28, "features.word_mimicry.compute_frequency_per_conv", false]], "computetf() (in module features.word_mimicry)": [[28, "features.word_mimicry.computeTF", false]], "concat_bert_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.concat_bert_features", false]], "conjection_seperator() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.conjection_seperator", false]], "content_mimicry_score() (in module features.word_mimicry)": [[28, "features.word_mimicry.Content_mimicry_score", false]], "content_mimicry_score_per_conv() (in module features.word_mimicry)": [[28, "features.word_mimicry.Content_mimicry_score_per_conv", false]], "conv_level_features() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.conv_level_features", false]], "conv_to_float_arr() (in module features.get_all_dd_features)": [[8, "features.get_all_DD_features.conv_to_float_arr", false]], "conversationlevelfeaturescalculator (class in utils.calculate_conversation_level_features)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator", false]], "count_all_caps() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_all_caps", false]], "count_bullet_points() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_bullet_points", false]], "count_characters() (in module features.basic_features)": [[3, "features.basic_features.count_characters", false]], "count_difficult_words() (in module features.readability)": [[21, "features.readability.count_difficult_words", false]], "count_ellipses() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_ellipses", false]], "count_emojis() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_emojis", false]], "count_emphasis() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_emphasis", false]], "count_line_breaks() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_line_breaks", false]], "count_links() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_links", false]], "count_matches() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.count_matches", false]], "count_messages() (in module features.basic_features)": [[3, 
"features.basic_features.count_messages", false]], "count_numbering() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_numbering", false]], "count_parentheses() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_parentheses", false]], "count_quotes() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_quotes", false]], "count_responding_to_someone() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_responding_to_someone", false]], "count_spacy_matches() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.count_spacy_matches", false]], "count_syllables() (in module features.readability)": [[21, "features.readability.count_syllables", false]], "count_turn_taking_index() (in module features.turn_taking_features)": [[25, "features.turn_taking_features.count_turn_taking_index", false]], "count_turns() (in module features.turn_taking_features)": [[25, "features.turn_taking_features.count_turns", false]], "count_user_references() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_user_references", false]], "count_words() (in module features.basic_features)": [[3, "features.basic_features.count_words", false]], "create_chunks() (in module utils.assign_chunk_nums)": [[63, "utils.assign_chunk_nums.create_chunks", false]], "create_chunks_messages() (in module utils.assign_chunk_nums)": [[63, "utils.assign_chunk_nums.create_chunks_messages", false]], "create_cumulative_rows() (in module utils.preprocess)": [[71, "utils.preprocess.create_cumulative_rows", false]], "dale_chall_helper() (in module features.readability)": [[21, "features.readability.dale_chall_helper", false]], "feat_counts() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.feat_counts", false]], "feature_builder": [[2, "module-feature_builder", false]], "featurebuilder (class in feature_builder)": [[2, "feature_builder.FeatureBuilder", false]], "features.basic_features": [[3, "module-features.basic_features", false]], "features.burstiness": [[4, "module-features.burstiness", false]], "features.certainty": [[5, "module-features.certainty", false]], "features.discursive_diversity": [[6, "module-features.discursive_diversity", false]], "features.fflow": [[7, "module-features.fflow", false]], "features.get_all_dd_features": [[8, "module-features.get_all_DD_features", false]], "features.get_user_network": [[9, "module-features.get_user_network", false]], "features.hedge": [[10, "module-features.hedge", false]], "features.info_exchange_zscore": [[12, "module-features.info_exchange_zscore", false]], "features.information_diversity": [[13, "module-features.information_diversity", false]], "features.lexical_features_v2": [[14, "module-features.lexical_features_v2", false]], "features.named_entity_recognition_features": [[15, "module-features.named_entity_recognition_features", false]], "features.other_lexical_features": [[16, "module-features.other_lexical_features", false]], "features.politeness_features": [[17, "module-features.politeness_features", false]], "features.politeness_v2": [[18, "module-features.politeness_v2", false]], "features.politeness_v2_helper": [[19, "module-features.politeness_v2_helper", false]], "features.question_num": [[20, "module-features.question_num", false]], "features.readability": [[21, "module-features.readability", false]], "features.reddit_tags": [[22, "module-features.reddit_tags", false]], "features.temporal_features": [[23, 
"module-features.temporal_features", false]], "features.textblob_sentiment_analysis": [[24, "module-features.textblob_sentiment_analysis", false]], "features.turn_taking_features": [[25, "module-features.turn_taking_features", false]], "features.variance_in_dd": [[26, "module-features.variance_in_DD", false]], "features.within_person_discursive_range": [[27, "module-features.within_person_discursive_range", false]], "features.word_mimicry": [[28, "module-features.word_mimicry", false]], "featurize() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.featurize", false]], "function_mimicry_score() (in module features.word_mimicry)": [[28, "features.word_mimicry.function_mimicry_score", false]], "generate_bert() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.generate_bert", false]], "generate_certainty_pkl() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.generate_certainty_pkl", false]], "generate_lexicon_pkl() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.generate_lexicon_pkl", false]], "generate_vect() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.generate_vect", false]], "get_centroids() (utils.calculate_user_level_features.userlevelfeaturescalculator method)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator.get_centroids", false]], "get_certainty() (in module features.certainty)": [[5, "features.certainty.get_certainty", false]], "get_certainty_score() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_certainty_score", false]], "get_content_words_in_message() (in module features.word_mimicry)": [[28, "features.word_mimicry.get_content_words_in_message", false]], "get_conversation_level_aggregates() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.get_conversation_level_aggregates", false]], "get_cosine_similarity() (in module features.discursive_diversity)": [[6, "features.discursive_diversity.get_cosine_similarity", false]], "get_dale_chall_easy_words() (in module utils.preload_word_lists)": [[70, "utils.preload_word_lists.get_dale_chall_easy_words", false]], "get_dale_chall_score_and_classfication() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_dale_chall_score_and_classfication", false]], "get_dd() (in module features.discursive_diversity)": [[6, "features.discursive_diversity.get_DD", false]], "get_dd_features() (in module features.get_all_dd_features)": [[8, "features.get_all_DD_features.get_DD_features", false]], "get_dep_pairs() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.get_dep_pairs", false]], "get_dep_pairs_noneg() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.get_dep_pairs_noneg", false]], "get_discursive_diversity_features() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.get_discursive_diversity_features", false]], "get_first_pct_of_chat() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.get_first_pct_of_chat", false]], "get_first_person_words() (in module utils.preload_word_lists)": [[70, 
"utils.preload_word_lists.get_first_person_words", false]], "get_forward_flow() (in module features.fflow)": [[7, "features.fflow.get_forward_flow", false]], "get_forward_flow() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_forward_flow", false]], "get_function_words() (in module utils.preload_word_lists)": [[70, "utils.preload_word_lists.get_function_words", false]], "get_function_words_in_message() (in module features.word_mimicry)": [[28, "features.word_mimicry.get_function_words_in_message", false]], "get_gini() (in module utils.gini_coefficient)": [[68, "utils.gini_coefficient.get_gini", false]], "get_gini_features() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.get_gini_features", false]], "get_info_diversity() (in module features.information_diversity)": [[13, "features.information_diversity.get_info_diversity", false]], "get_info_exchange_wordcount() (in module features.info_exchange_zscore)": [[12, "features.info_exchange_zscore.get_info_exchange_wordcount", false]], "get_liwc_count() (in module features.lexical_features_v2)": [[14, "features.lexical_features_v2.get_liwc_count", false]], "get_max() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_max", false]], "get_mean() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_mean", false]], "get_median() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_median", false]], "get_mimicry_bert() (in module features.word_mimicry)": [[28, "features.word_mimicry.get_mimicry_bert", false]], "get_min() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_min", false]], "get_moving_mimicry() (in module features.word_mimicry)": [[28, "features.word_mimicry.get_moving_mimicry", false]], "get_named_entity() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_named_entity", false]], "get_nan_vector() (in module features.within_person_discursive_range)": [[27, "features.within_person_discursive_range.get_nan_vector", false]], "get_nan_vector() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.get_nan_vector", false]], "get_polarity_score() (in module features.textblob_sentiment_analysis)": [[24, "features.textblob_sentiment_analysis.get_polarity_score", false]], "get_politeness_strategies() (in module features.politeness_features)": [[17, "features.politeness_features.get_politeness_strategies", false]], "get_politeness_v2() (in module features.politeness_v2)": [[18, "features.politeness_v2.get_politeness_v2", false]], "get_proportion_first_pronouns() (in module features.other_lexical_features)": [[16, "features.other_lexical_features.get_proportion_first_pronouns", false]], "get_question_words() (in module utils.preload_word_lists)": [[70, "utils.preload_word_lists.get_question_words", false]], "get_reddit_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_reddit_features", false]], "get_sentiment() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.get_sentiment", false]], "get_stdev() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_stdev", false]], 
"get_subjectivity_score() (in module features.textblob_sentiment_analysis)": [[24, "features.textblob_sentiment_analysis.get_subjectivity_score", false]], "get_sum() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_sum", false]], "get_team_burstiness() (in module features.burstiness)": [[4, "features.burstiness.get_team_burstiness", false]], "get_temporal_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_temporal_features", false]], "get_time_diff() (in module features.temporal_features)": [[23, "features.temporal_features.get_time_diff", false]], "get_time_diff_startend() (in module features.temporal_features)": [[23, "features.temporal_features.get_time_diff_startend", false]], "get_turn() (in module features.turn_taking_features)": [[25, "features.turn_taking_features.get_turn", false]], "get_turn_id() (in module utils.preprocess)": [[71, "utils.preprocess.get_turn_id", false]], "get_turn_taking_features() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.get_turn_taking_features", false]], "get_unique_pairwise_combos() (in module features.discursive_diversity)": [[6, "features.discursive_diversity.get_unique_pairwise_combos", false]], "get_user_level_aggregates() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.get_user_level_aggregates", false]], "get_user_level_summary_statistics_features() (utils.calculate_user_level_features.userlevelfeaturescalculator method)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator.get_user_level_summary_statistics_features", false]], "get_user_level_summed_features() (utils.calculate_user_level_features.userlevelfeaturescalculator method)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator.get_user_level_summed_features", false]], "get_user_max_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_max_dataframe", false]], "get_user_mean_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_mean_dataframe", false]], "get_user_median_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_median_dataframe", false]], "get_user_min_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_min_dataframe", false]], "get_user_network() (in module features.get_user_network)": [[9, "features.get_user_network.get_user_network", false]], "get_user_network() (utils.calculate_user_level_features.userlevelfeaturescalculator method)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator.get_user_network", false]], "get_user_stdev_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_stdev_dataframe", false]], "get_user_sum_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_sum_dataframe", false]], "get_variance_in_dd() (in module features.variance_in_dd)": [[26, "features.variance_in_DD.get_variance_in_DD", false]], "get_within_person_disc_range() (in module features.within_person_discursive_range)": [[27, "features.within_person_discursive_range.get_within_person_disc_range", false]], 
"get_word_ttr() (in module features.other_lexical_features)": [[16, "features.other_lexical_features.get_word_TTR", false]], "get_zscore_across_all_chats() (in module utils.zscore_chats_and_conversation)": [[73, "utils.zscore_chats_and_conversation.get_zscore_across_all_chats", false]], "get_zscore_across_all_conversations() (in module utils.zscore_chats_and_conversation)": [[73, "utils.zscore_chats_and_conversation.get_zscore_across_all_conversations", false]], "gini_coefficient() (in module utils.gini_coefficient)": [[68, "utils.gini_coefficient.gini_coefficient", false]], "info_diversity() (in module features.information_diversity)": [[13, "features.information_diversity.info_diversity", false]], "info_exchange() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.info_exchange", false]], "is_hedged_sentence_1() (in module features.hedge)": [[10, "features.hedge.is_hedged_sentence_1", false]], "lexical_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.lexical_features", false]], "liwc_features() (in module features.lexical_features_v2)": [[14, "features.lexical_features_v2.liwc_features", false]], "load_saved_data() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.load_saved_data", false]], "load_to_dict() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.load_to_dict", false]], "load_to_lists() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.load_to_lists", false]], "merge_conv_data_with_original() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.merge_conv_data_with_original", false]], "mimic_words() (in module features.word_mimicry)": [[28, "features.word_mimicry.mimic_words", false]], "module": [[2, "module-feature_builder", false], [3, "module-features.basic_features", false], [4, "module-features.burstiness", false], [5, "module-features.certainty", false], [6, "module-features.discursive_diversity", false], [7, "module-features.fflow", false], [8, "module-features.get_all_DD_features", false], [9, "module-features.get_user_network", false], [10, "module-features.hedge", false], [12, "module-features.info_exchange_zscore", false], [13, "module-features.information_diversity", false], [14, "module-features.lexical_features_v2", false], [15, "module-features.named_entity_recognition_features", false], [16, "module-features.other_lexical_features", false], [17, "module-features.politeness_features", false], [18, "module-features.politeness_v2", false], [19, "module-features.politeness_v2_helper", false], [20, "module-features.question_num", false], [21, "module-features.readability", false], [22, "module-features.reddit_tags", false], [23, "module-features.temporal_features", false], [24, "module-features.textblob_sentiment_analysis", false], [25, "module-features.turn_taking_features", false], [26, "module-features.variance_in_DD", false], [27, "module-features.within_person_discursive_range", false], [28, "module-features.word_mimicry", false], [63, "module-utils.assign_chunk_nums", false], [64, "module-utils.calculate_chat_level_features", false], [65, "module-utils.calculate_conversation_level_features", false], [66, "module-utils.calculate_user_level_features", false], [67, "module-utils.check_embeddings", false], [68, "module-utils.gini_coefficient", 
false], [70, "module-utils.preload_word_lists", false], [71, "module-utils.preprocess", false], [72, "module-utils.summarize_features", false], [73, "module-utils.zscore_chats_and_conversation", false]], "named_entities() (in module features.named_entity_recognition_features)": [[15, "features.named_entity_recognition_features.named_entities", false]], "num_named_entity() (in module features.named_entity_recognition_features)": [[15, "features.named_entity_recognition_features.num_named_entity", false]], "other_lexical_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.other_lexical_features", false]], "phrase_split() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.phrase_split", false]], "positivity_zscore() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.positivity_zscore", false]], "prep_simple() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.prep_simple", false]], "prep_whole() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.prep_whole", false]], "preprocess_chat_data() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.preprocess_chat_data", false]], "preprocess_conversation_columns() (in module utils.preprocess)": [[71, "utils.preprocess.preprocess_conversation_columns", false]], "preprocess_naive_turns() (in module utils.preprocess)": [[71, "utils.preprocess.preprocess_naive_turns", false]], "preprocess_text() (in module utils.preprocess)": [[71, "utils.preprocess.preprocess_text", false]], "preprocess_text_lowercase_but_retain_punctuation() (in module utils.preprocess)": [[71, "utils.preprocess.preprocess_text_lowercase_but_retain_punctuation", false]], "preprocessing() (in module features.information_diversity)": [[13, "features.information_diversity.preprocessing", false]], "punctuation_seperator() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.punctuation_seperator", false]], "question() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.Question", false]], "read_in_lexicons() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.read_in_lexicons", false]], "reduce_chunks() (in module utils.assign_chunk_nums)": [[63, "utils.assign_chunk_nums.reduce_chunks", false]], "remove_active_user() (in module features.get_user_network)": [[9, "features.get_user_network.remove_active_user", false]], "save_features() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.save_features", false]], "sentence_pad() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.sentence_pad", false]], "sentence_split() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.sentence_split", false]], "sentenciser() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.sentenciser", false]], "set_self_conv_data() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.set_self_conv_data", false]], "str_to_vec() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.str_to_vec", false]], "text_based_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, 
"utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.text_based_features", false]], "token_count() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.token_count", false]], "train_spacy_ner() (in module features.named_entity_recognition_features)": [[15, "features.named_entity_recognition_features.train_spacy_ner", false]], "user_level_features() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.user_level_features", false]], "userlevelfeaturescalculator (class in utils.calculate_user_level_features)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator", false]], "utils.assign_chunk_nums": [[63, "module-utils.assign_chunk_nums", false]], "utils.calculate_chat_level_features": [[64, "module-utils.calculate_chat_level_features", false]], "utils.calculate_conversation_level_features": [[65, "module-utils.calculate_conversation_level_features", false]], "utils.calculate_user_level_features": [[66, "module-utils.calculate_user_level_features", false]], "utils.check_embeddings": [[67, "module-utils.check_embeddings", false]], "utils.gini_coefficient": [[68, "module-utils.gini_coefficient", false]], "utils.preload_word_lists": [[70, "module-utils.preload_word_lists", false]], "utils.preprocess": [[71, "module-utils.preprocess", false]], "utils.summarize_features": [[72, "module-utils.summarize_features", false]], "utils.zscore_chats_and_conversation": [[73, "module-utils.zscore_chats_and_conversation", false]], "word_start() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.word_start", false]]}, "objects": {"": [[2, 0, 0, "-", "feature_builder"]], "feature_builder": [[2, 1, 1, "", "FeatureBuilder"]], "feature_builder.FeatureBuilder": [[2, 2, 1, "", "chat_level_features"], [2, 2, 1, "", "conv_level_features"], [2, 2, 1, "", "featurize"], [2, 2, 1, "", "get_first_pct_of_chat"], [2, 2, 1, "", "merge_conv_data_with_original"], [2, 2, 1, "", "preprocess_chat_data"], [2, 2, 1, "", "save_features"], [2, 2, 1, "", "set_self_conv_data"], [2, 2, 1, "", "user_level_features"]], "features": [[3, 0, 0, "-", "basic_features"], [4, 0, 0, "-", "burstiness"], [5, 0, 0, "-", "certainty"], [6, 0, 0, "-", "discursive_diversity"], [7, 0, 0, "-", "fflow"], [8, 0, 0, "-", "get_all_DD_features"], [9, 0, 0, "-", "get_user_network"], [10, 0, 0, "-", "hedge"], [12, 0, 0, "-", "info_exchange_zscore"], [13, 0, 0, "-", "information_diversity"], [14, 0, 0, "-", "lexical_features_v2"], [15, 0, 0, "-", "named_entity_recognition_features"], [16, 0, 0, "-", "other_lexical_features"], [17, 0, 0, "-", "politeness_features"], [18, 0, 0, "-", "politeness_v2"], [19, 0, 0, "-", "politeness_v2_helper"], [20, 0, 0, "-", "question_num"], [21, 0, 0, "-", "readability"], [22, 0, 0, "-", "reddit_tags"], [23, 0, 0, "-", "temporal_features"], [24, 0, 0, "-", "textblob_sentiment_analysis"], [25, 0, 0, "-", "turn_taking_features"], [26, 0, 0, "-", "variance_in_DD"], [27, 0, 0, "-", "within_person_discursive_range"], [28, 0, 0, "-", "word_mimicry"]], "features.basic_features": [[3, 3, 1, "", "count_characters"], [3, 3, 1, "", "count_messages"], [3, 3, 1, "", "count_words"]], "features.burstiness": [[4, 3, 1, "", "burstiness"], [4, 3, 1, "", "get_team_burstiness"]], "features.certainty": [[5, 3, 1, "", "get_certainty"]], "features.discursive_diversity": [[6, 3, 1, "", "get_DD"], [6, 3, 1, "", "get_cosine_similarity"], [6, 3, 1, "", "get_unique_pairwise_combos"]], "features.fflow": [[7, 3, 1, "", "get_forward_flow"]], 
"features.get_all_DD_features": [[8, 3, 1, "", "conv_to_float_arr"], [8, 3, 1, "", "get_DD_features"]], "features.get_user_network": [[9, 3, 1, "", "get_user_network"], [9, 3, 1, "", "remove_active_user"]], "features.hedge": [[10, 3, 1, "", "is_hedged_sentence_1"]], "features.info_exchange_zscore": [[12, 3, 1, "", "get_info_exchange_wordcount"]], "features.information_diversity": [[13, 3, 1, "", "calculate_ID_score"], [13, 3, 1, "", "get_info_diversity"], [13, 3, 1, "", "info_diversity"], [13, 3, 1, "", "preprocessing"]], "features.lexical_features_v2": [[14, 3, 1, "", "get_liwc_count"], [14, 3, 1, "", "liwc_features"]], "features.named_entity_recognition_features": [[15, 3, 1, "", "built_spacy_ner"], [15, 3, 1, "", "calculate_named_entities"], [15, 3, 1, "", "named_entities"], [15, 3, 1, "", "num_named_entity"], [15, 3, 1, "", "train_spacy_ner"]], "features.other_lexical_features": [[16, 3, 1, "", "classify_NTRI"], [16, 3, 1, "", "get_proportion_first_pronouns"], [16, 3, 1, "", "get_word_TTR"]], "features.politeness_features": [[17, 3, 1, "", "get_politeness_strategies"]], "features.politeness_v2": [[18, 3, 1, "", "get_politeness_v2"]], "features.politeness_v2_helper": [[19, 3, 1, "", "Question"], [19, 3, 1, "", "adverb_limiter"], [19, 3, 1, "", "bare_command"], [19, 3, 1, "", "clean_text"], [19, 3, 1, "", "commit_data"], [19, 3, 1, "", "conjection_seperator"], [19, 3, 1, "", "count_matches"], [19, 3, 1, "", "count_spacy_matches"], [19, 3, 1, "", "feat_counts"], [19, 3, 1, "", "get_dep_pairs"], [19, 3, 1, "", "get_dep_pairs_noneg"], [19, 3, 1, "", "load_saved_data"], [19, 3, 1, "", "load_to_dict"], [19, 3, 1, "", "load_to_lists"], [19, 3, 1, "", "phrase_split"], [19, 3, 1, "", "prep_simple"], [19, 3, 1, "", "prep_whole"], [19, 3, 1, "", "punctuation_seperator"], [19, 3, 1, "", "sentence_pad"], [19, 3, 1, "", "sentence_split"], [19, 3, 1, "", "sentenciser"], [19, 3, 1, "", "token_count"], [19, 3, 1, "", "word_start"]], "features.question_num": [[20, 3, 1, "", "calculate_num_question_naive"]], "features.readability": [[21, 3, 1, "", "classify_text_dalechall"], [21, 3, 1, "", "count_difficult_words"], [21, 3, 1, "", "count_syllables"], [21, 3, 1, "", "dale_chall_helper"]], "features.reddit_tags": [[22, 3, 1, "", "count_all_caps"], [22, 3, 1, "", "count_bullet_points"], [22, 3, 1, "", "count_ellipses"], [22, 3, 1, "", "count_emojis"], [22, 3, 1, "", "count_emphasis"], [22, 3, 1, "", "count_line_breaks"], [22, 3, 1, "", "count_links"], [22, 3, 1, "", "count_numbering"], [22, 3, 1, "", "count_parentheses"], [22, 3, 1, "", "count_quotes"], [22, 3, 1, "", "count_responding_to_someone"], [22, 3, 1, "", "count_user_references"]], "features.temporal_features": [[23, 3, 1, "", "coerce_to_date_or_number"], [23, 3, 1, "", "get_time_diff"], [23, 3, 1, "", "get_time_diff_startend"]], "features.textblob_sentiment_analysis": [[24, 3, 1, "", "get_polarity_score"], [24, 3, 1, "", "get_subjectivity_score"]], "features.turn_taking_features": [[25, 3, 1, "", "count_turn_taking_index"], [25, 3, 1, "", "count_turns"], [25, 3, 1, "", "get_turn"]], "features.variance_in_DD": [[26, 3, 1, "", "get_variance_in_DD"]], "features.within_person_discursive_range": [[27, 3, 1, "", "get_nan_vector"], [27, 3, 1, "", "get_within_person_disc_range"]], "features.word_mimicry": [[28, 3, 1, "", "Content_mimicry_score"], [28, 3, 1, "", "Content_mimicry_score_per_conv"], [28, 3, 1, "", "computeTF"], [28, 3, 1, "", "compute_frequency"], [28, 3, 1, "", "compute_frequency_per_conv"], [28, 3, 1, "", "function_mimicry_score"], [28, 3, 1, 
"", "get_content_words_in_message"], [28, 3, 1, "", "get_function_words_in_message"], [28, 3, 1, "", "get_mimicry_bert"], [28, 3, 1, "", "get_moving_mimicry"], [28, 3, 1, "", "mimic_words"]], "utils": [[63, 0, 0, "-", "assign_chunk_nums"], [64, 0, 0, "-", "calculate_chat_level_features"], [65, 0, 0, "-", "calculate_conversation_level_features"], [66, 0, 0, "-", "calculate_user_level_features"], [67, 0, 0, "-", "check_embeddings"], [68, 0, 0, "-", "gini_coefficient"], [70, 0, 0, "-", "preload_word_lists"], [71, 0, 0, "-", "preprocess"], [72, 0, 0, "-", "summarize_features"], [73, 0, 0, "-", "zscore_chats_and_conversation"]], "utils.assign_chunk_nums": [[63, 3, 1, "", "assign_chunk_nums"], [63, 3, 1, "", "create_chunks"], [63, 3, 1, "", "create_chunks_messages"], [63, 3, 1, "", "reduce_chunks"]], "utils.calculate_chat_level_features": [[64, 1, 1, "", "ChatLevelFeaturesCalculator"]], "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator": [[64, 2, 1, "", "calculate_chat_level_features"], [64, 2, 1, "", "calculate_hedge_features"], [64, 2, 1, "", "calculate_politeness_sentiment"], [64, 2, 1, "", "calculate_politeness_v2"], [64, 2, 1, "", "calculate_textblob_sentiment"], [64, 2, 1, "", "calculate_vector_word_mimicry"], [64, 2, 1, "", "calculate_word_mimicry"], [64, 2, 1, "", "concat_bert_features"], [64, 2, 1, "", "get_certainty_score"], [64, 2, 1, "", "get_dale_chall_score_and_classfication"], [64, 2, 1, "", "get_forward_flow"], [64, 2, 1, "", "get_named_entity"], [64, 2, 1, "", "get_reddit_features"], [64, 2, 1, "", "get_temporal_features"], [64, 2, 1, "", "info_exchange"], [64, 2, 1, "", "lexical_features"], [64, 2, 1, "", "other_lexical_features"], [64, 2, 1, "", "positivity_zscore"], [64, 2, 1, "", "text_based_features"]], "utils.calculate_conversation_level_features": [[65, 1, 1, "", "ConversationLevelFeaturesCalculator"]], "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator": [[65, 2, 1, "", "calculate_conversation_level_features"], [65, 2, 1, "", "calculate_info_diversity"], [65, 2, 1, "", "calculate_team_burstiness"], [65, 2, 1, "", "get_conversation_level_aggregates"], [65, 2, 1, "", "get_discursive_diversity_features"], [65, 2, 1, "", "get_gini_features"], [65, 2, 1, "", "get_turn_taking_features"], [65, 2, 1, "", "get_user_level_aggregates"]], "utils.calculate_user_level_features": [[66, 1, 1, "", "UserLevelFeaturesCalculator"]], "utils.calculate_user_level_features.UserLevelFeaturesCalculator": [[66, 2, 1, "", "calculate_user_level_features"], [66, 2, 1, "", "get_centroids"], [66, 2, 1, "", "get_user_level_summary_statistics_features"], [66, 2, 1, "", "get_user_level_summed_features"], [66, 2, 1, "", "get_user_network"]], "utils.check_embeddings": [[67, 3, 1, "", "check_embeddings"], [67, 3, 1, "", "generate_bert"], [67, 3, 1, "", "generate_certainty_pkl"], [67, 3, 1, "", "generate_lexicon_pkl"], [67, 3, 1, "", "generate_vect"], [67, 3, 1, "", "get_nan_vector"], [67, 3, 1, "", "get_sentiment"], [67, 3, 1, "", "read_in_lexicons"], [67, 3, 1, "", "str_to_vec"]], "utils.gini_coefficient": [[68, 3, 1, "", "get_gini"], [68, 3, 1, "", "gini_coefficient"]], "utils.preload_word_lists": [[70, 3, 1, "", "get_dale_chall_easy_words"], [70, 3, 1, "", "get_first_person_words"], [70, 3, 1, "", "get_function_words"], [70, 3, 1, "", "get_question_words"]], "utils.preprocess": [[71, 3, 1, "", "assert_key_columns_present"], [71, 3, 1, "", "compress"], [71, 3, 1, "", "create_cumulative_rows"], [71, 3, 1, "", "get_turn_id"], [71, 3, 1, "", 
"preprocess_conversation_columns"], [71, 3, 1, "", "preprocess_naive_turns"], [71, 3, 1, "", "preprocess_text"], [71, 3, 1, "", "preprocess_text_lowercase_but_retain_punctuation"]], "utils.summarize_features": [[72, 3, 1, "", "get_max"], [72, 3, 1, "", "get_mean"], [72, 3, 1, "", "get_median"], [72, 3, 1, "", "get_min"], [72, 3, 1, "", "get_stdev"], [72, 3, 1, "", "get_sum"], [72, 3, 1, "", "get_user_max_dataframe"], [72, 3, 1, "", "get_user_mean_dataframe"], [72, 3, 1, "", "get_user_median_dataframe"], [72, 3, 1, "", "get_user_min_dataframe"], [72, 3, 1, "", "get_user_stdev_dataframe"], [72, 3, 1, "", "get_user_sum_dataframe"]], "utils.zscore_chats_and_conversation": [[73, 3, 1, "", "get_zscore_across_all_chats"], [73, 3, 1, "", "get_zscore_across_all_conversations"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function"}, "terms": {"": [0, 1, 2, 4, 5, 9, 11, 13, 25, 28, 29, 31, 32, 34, 35, 36, 40, 41, 42, 43, 44, 45, 46, 47, 49, 50, 51, 52, 55, 59, 61, 62, 64, 65, 66], "0": [0, 1, 2, 5, 10, 13, 16, 21, 24, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 45, 46, 47, 50, 51, 53, 55, 59, 61], "000": 42, "00222437221134802": [5, 64], "01": 51, "02": 51, "04": 40, "0496": [21, 33], "05": [13, 40, 50, 51], "06": 51, "08": 50, "09": [45, 46, 50], "1": [0, 1, 2, 3, 10, 13, 22, 24, 32, 34, 35, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 51, 53, 55, 56, 57, 59, 61, 62], "10": [1, 5, 6, 21, 24, 33, 42, 59, 61, 64], "100": [1, 21, 33, 37, 42, 47, 62], "1000": 42, "10th": 33, "1145": [21, 24], "1177": [5, 64], "11th": 33, "12": [35, 45, 46, 50], "1287": 6, "12th": 33, "13": 50, "14": 50, "15": [1, 37, 50], "1579": [21, 33], "17": 50, "1948": 33, "195": 36, "1977": 62, "1d": 67, "1lpngokujsx": 5, "1st": 50, "1st_person": 50, "1st_person_pl": 50, "1st_person_start": 50, "2": [0, 1, 2, 34, 35, 41, 47, 59, 61, 62], "20": [37, 59], "2004": 42, "2007": [5, 42], "2009": 60, "2012": 55, "2013": [12, 16, 31, 32, 36, 37, 38, 41, 43, 49, 50, 52, 54, 70], "2015": [53, 58, 60], "2016": 4, "2017": 13, "2018": [40, 44, 55], "2019": [35, 52], "2020": [18, 21, 24, 33, 49, 50, 56, 57], "2021": [1, 6, 43, 44], "2022": [13, 34], "2023": [1, 5, 30, 59, 61, 64], "2024": 40, "21": 59, "22": [41, 50], "2384068": 4, "24": [1, 61], "25": 47, "27": 50, "28": 50, "29": 50, "2nd": 50, "2nd_person": 50, "2nd_person_start": 50, "3": [0, 1, 2, 21, 34, 41, 42, 51, 59, 61, 71], "30": 50, "3000": 33, "32": [34, 50], "3432929": [21, 24], "35": 51, "36": 50, "38": 50, "39": 49, "39512260": 68, "3n": 59, "4": [0, 1, 5, 13, 21, 30, 33, 41, 42, 56, 61, 62], "4274": 6, "43": 50, "45": 50, "47": 50, "49": 50, "4pit4bqz6": 5, "4th": [21, 33], "5": [1, 5, 21, 30, 33, 37, 41, 59], "50": [1, 47], "52": 50, "53": 50, "57": 50, "58": 50, "5th": 33, "6": [1, 33, 43], "60": 51, "63": 50, "6365": 21, "64": 67, "68": 47, "6th": 33, "7": [30, 33, 48], "70": 50, "78": [35, 50], "7th": 33, "8": [1, 30, 33], "80": [21, 70], "82": 41, "85": 34, "86": 35, "87": 50, "89": [45, 46], "8th": 33, "9": [2, 5, 21, 30, 33, 40, 47, 50], "9123": 47, "92": 51, "93chall_readability_formula": [21, 70], "94": 15, "95": 47, "97": 51, "9855072464": 47, "9992": 47, "99954": 47, "9th": 33, "A": [1, 2, 4, 12, 13, 14, 15, 16, 17, 18, 19, 21, 23, 25, 28, 33, 34, 35, 37, 38, 40, 41, 44, 45, 46, 47, 49, 50, 51, 52, 57, 59, 60, 61, 62, 66, 67, 68, 70, 71, 72, 73], "And": 
[1, 62], "As": [1, 31, 35, 36, 40, 45, 61], "Be": 1, "But": [1, 50, 62], "By": [0, 1, 11, 42, 50], "For": [0, 1, 31, 34, 37, 41, 42, 43, 47, 49, 54, 56, 59, 62, 65], "If": [0, 1, 2, 5, 21, 29, 30, 35, 45, 47, 50, 55, 61, 62, 63, 64, 65, 66, 67, 71], "In": [1, 21, 30, 31, 34, 35, 36, 37, 39, 41, 42, 45, 46, 47, 50, 55, 59, 61, 62], "It": [1, 2, 31, 32, 33, 36, 37, 41, 44, 45, 46, 50, 64, 65, 66, 67, 71], "NO": 37, "NOT": [1, 61], "No": [19, 50, 53], "Not": 41, "One": [1, 37, 61], "That": [29, 55], "The": [1, 2, 3, 4, 5, 7, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 58, 59, 60, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73], "Then": [1, 55, 61], "There": [1, 11, 32, 61, 66], "These": [1, 11, 17, 32, 34, 42, 48, 52, 61, 62, 69], "To": [0, 1, 29, 31, 34, 37, 40, 55, 56, 57, 61, 62], "WITH": 21, "Will": 50, "_deviat": 55, "_preprocessed_": 0, "abil": [13, 29], "abl": [31, 36, 61], "abort": 1, "about": [1, 12, 29, 31, 36, 41, 47, 61, 62], "abov": [1, 21, 34, 61], "abstract_id": 4, "accept": [0, 1, 58, 61], "access": [0, 1, 11, 15, 61], "accommod": [28, 32, 39, 45, 46, 64], "accord": [21, 37, 59, 64, 70], "accordingli": 63, "account": [1, 29, 32, 42], "accus": 50, "achiev": [50, 62], "acknowledg": 49, "acm": [21, 24], "acommod": 36, "across": [1, 13, 28, 31, 34, 40, 41, 50, 62, 64, 73], "action": 59, "activ": [1, 9, 44, 55, 71], "actual": [41, 56], "ad": [61, 62, 71], "adapt": 59, "add": [0, 1, 2, 21, 51, 61], "addit": [2, 32, 34, 42, 63, 69], "addition": [0, 30, 31, 32, 54], "address": 1, "adjac": 71, "adjust": [0, 21, 37, 63], "advanc": [31, 36], "advantag": 4, "adverb": [19, 31, 36], "adverb_limit": [19, 49], "affect": [0, 1, 29, 35, 44], "affirm": 49, "after": [0, 1, 31, 34, 36, 43, 61, 62, 64], "again": [32, 34], "against": [28, 31, 36, 52], "agarw": 62, "aggreg": [0, 2, 3, 37, 44, 61, 62, 65, 66, 72], "agre": 47, "agreement": 49, "ah": [31, 36], "ai": 62, "aim": [39, 62], "airtim": [37, 62], "al": [1, 5, 16, 18, 21, 24, 30, 31, 32, 33, 34, 35, 36, 38, 42, 43, 44, 49, 50, 52, 53, 54, 56, 57, 58, 59, 60, 64], "algorithm": [56, 57], "align": [35, 51], "all": [0, 1, 2, 6, 11, 12, 13, 15, 19, 22, 28, 30, 31, 34, 35, 36, 37, 40, 41, 42, 46, 48, 49, 51, 52, 55, 58, 61, 62, 64, 65, 66, 71, 73], "allow": 1, "almaatouq": 59, "along": 1, "alongsid": 1, "alphabet": 49, "alphanumer": 71, "alreadi": [0, 1, 2, 4, 10, 12, 16, 67], "also": [0, 1, 2, 28, 30, 31, 32, 34, 36, 37, 38, 42, 47, 51, 54, 60, 61, 62, 64, 65, 67, 69, 71], "alsobai": 59, "altern": 59, "although": [1, 23, 31, 36], "alwai": [1, 55], "am": [31, 36, 42, 54, 62], "amaz": [48, 56], "ambient": 32, "american": 33, "ami": [47, 59, 62], "amic": 62, "among": [36, 37, 52, 55, 62], "amongst": [6, 35, 48], "an": [0, 1, 2, 5, 8, 11, 12, 13, 21, 29, 30, 31, 32, 33, 34, 36, 38, 40, 41, 42, 45, 47, 48, 49, 50, 51, 52, 54, 59, 60, 61, 62, 63, 65, 66, 67, 68], "analys": [1, 62], "analysi": [0, 1, 11, 52, 62, 67, 71], "analyt": 62, "analyz": [0, 2, 13, 14, 16, 17, 19, 20, 21, 22, 24, 28, 43, 52, 62, 67, 71], "analyze_first_pct": [0, 1, 2], "angri": 47, "ani": [0, 1, 29, 31, 33, 38, 54, 62, 71], "annot": [17, 50], "anoth": [30, 34, 36, 48], "answer": 29, "anybodi": [31, 36], "anyth": [1, 23, 31, 36, 56], "anywher": [31, 36], "apartment": 42, "api": [2, 47], "api_refer": 24, "apolog": [17, 50], "apologi": 49, "appear": [0, 15, 28, 31, 37, 38, 42, 64], "append": [1, 17, 64, 65, 66, 67], "appli": [4, 13, 18, 62, 64, 69], "applic": [29, 71], "appreci": 
50, "approach": [32, 38, 42, 45, 46, 49, 53, 64], "appropri": [1, 31, 69], "ar": [0, 1, 2, 3, 5, 9, 10, 11, 15, 17, 19, 21, 23, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 51, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 69, 71], "arcross": 34, "area": 62, "aren": [31, 36], "around": 2, "arous": 48, "arrai": [6, 8, 67, 68], "articl": [37, 50], "ask": [20, 47, 54], "ask_ag": 49, "aspect": [50, 62], "assert_key_columns_pres": 71, "assign": [1, 31, 36, 38, 45, 46, 52, 59, 61, 63, 71], "assign_chunk_num": 69, "associ": [1, 4, 15, 21, 29, 30, 31, 32, 36, 40, 45, 46, 47, 48, 61], "assum": [0, 1, 2, 10, 12, 16, 23, 31, 41, 60, 61, 71], "assumpt": [1, 41, 61], "asterisk": 22, "attribut": [0, 1, 11, 34, 51, 52, 56, 62], "author": [5, 31, 36, 59], "auto": 2, "automat": [0, 1, 61, 69], "auxiliari": [31, 36], "avail": [0, 1, 61, 62, 63, 64, 67], "averag": [1, 11, 13, 28, 30, 33, 34, 35, 40, 41, 46, 52, 64, 65, 72], "avil": 62, "avoid": 30, "awar": 29, "awesom": 62, "b": [4, 34, 35, 45, 46, 55, 62], "back": 62, "bag": [32, 38, 42, 45, 46, 49, 53, 56, 57], "bare_command": [19, 49], "base": [0, 1, 2, 15, 18, 19, 31, 32, 34, 35, 36, 37, 40, 42, 51, 52, 53, 54, 55, 56, 57, 61, 62, 63, 64, 65, 66, 71], "basic": [10, 11, 12, 16, 61, 62], "basic_featur": 11, "batch": 67, "batch_num": 1, "batch_siz": 67, "bay": [56, 57], "bbevi": 18, "becaus": [1, 2, 12, 21, 31, 36, 40, 56, 61], "becom": [44, 61, 62], "been": [1, 12, 16, 31, 36, 61], "befor": [0, 1, 2, 17, 31, 36, 45, 48], "beforehand": 64, "begin": [34, 54, 58, 61, 62, 63], "behavior": [0, 1, 11, 62, 63], "being": [4, 13, 14, 16, 17, 20, 21, 24, 31, 32, 36, 43, 47, 51, 55, 56, 60], "belong": [1, 42], "below": [1, 11, 21, 33, 36, 45, 48, 51, 61, 62, 69], "ber": 54, "bert": [0, 1, 31, 35, 36, 39, 46, 61, 64, 67], "bert_path": 67, "bert_sentiment_data": [1, 61, 64], "best": 29, "better": [31, 61], "between": [4, 6, 13, 21, 23, 24, 28, 30, 31, 34, 35, 36, 37, 40, 45, 46, 55, 58, 59, 62, 64, 65], "betwen": 34, "beyond": 2, "big": 59, "binari": [10, 32, 38], "blame": 47, "blob": [1, 24, 61], "block": [22, 32, 48, 59], "blog": 15, "bold": [22, 64], "bool": [2, 63, 65, 66, 67, 71], "boolean": 1, "bootstrap": 62, "both": [0, 1, 2, 42, 52, 54, 55, 59, 62], "bother": 50, "bottom": 59, "bought": 41, "bound": [29, 35, 36, 37, 42, 52, 55], "boundari": [34, 35], "break": [22, 48, 64], "brief": 44, "broader": 52, "broken": 59, "btw": 50, "bug": [1, 61], "build": [1, 7, 34, 45, 46, 62], "built": [1, 11], "built_spacy_n": 15, "bullet": [22, 48, 64], "bunch": 59, "burst": 58, "bursti": [1, 11, 39, 58, 61, 65], "by_the_wai": 49, "c": [12, 34, 35, 45, 46, 62], "cach": [0, 2, 51, 61], "calcul": [1, 2, 5, 11, 12, 16, 18, 21, 28, 33, 41, 48, 49, 50, 56, 57, 58, 60, 62, 63, 64, 65, 66, 67, 68, 72, 73], "calculate_chat_level_featur": [1, 61, 69], "calculate_conversation_level_featur": 69, "calculate_hedge_featur": 64, "calculate_id_scor": 13, "calculate_info_divers": 65, "calculate_named_ent": 15, "calculate_num_question_na": 20, "calculate_politeness_senti": 64, "calculate_politeness_v2": 64, "calculate_team_bursti": 65, "calculate_textblob_senti": 64, "calculate_user_level_featur": 69, "calculate_vector_word_mimicri": 64, "calculate_word_mimicri": 64, "call": [1, 2, 8, 11, 13, 61, 62, 64, 69], "can": [0, 1, 2, 11, 23, 31, 32, 33, 34, 36, 37, 42, 43, 44, 47, 48, 49, 50, 52, 54, 60, 61, 62, 69], "can_you": 49, "cannot": [1, 31, 36, 45, 46, 49, 62], "cao": [21, 24, 33, 43, 44, 56, 57, 62], "cap": [22, 48, 64], "capit": [0, 2, 48], "captur": 
[29, 30, 32, 34, 35, 38, 41, 42, 55], "caract": 40, "cardiffnlp": [1, 61], "care": 1, "carefulli": 60, "carri": 31, "casa_token": 5, "case": [1, 13, 16, 28, 29, 30, 31, 36, 37, 41, 45, 46, 51, 55, 56, 59, 61], "casual": 43, "categori": [21, 32, 45, 46, 49, 52], "caus": [31, 32, 36, 59], "center": 62, "central": 34, "centroid": [34, 66], "certain": [5, 19, 30, 42, 45, 46, 49], "certainli": 42, "certainti": [11, 38, 39, 42, 64, 67], "cfm": 4, "chall": [1, 21, 39, 64, 70], "chang": [0, 1, 34, 50, 61, 71], "charact": [1, 2, 3, 15, 19, 37, 49, 62, 64, 65, 66, 71], "characterist": [1, 62], "chat": [0, 1, 2, 4, 5, 6, 7, 8, 12, 13, 14, 16, 23, 25, 28, 29, 32, 35, 36, 41, 44, 45, 46, 49, 59, 61, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73], "chat_data": [2, 6, 7, 8, 26, 27, 28, 63, 64, 65, 66, 67, 71], "chat_df": 14, "chat_featur": [1, 61, 65, 66], "chat_level_data": 72, "chat_level_featur": 2, "chatlevelfeaturescalcul": [1, 2, 17, 21, 61, 64, 69], "chats_data": 73, "check": [19, 23, 44, 64, 67, 71], "check_embed": [1, 61, 69], "chen": 62, "choic": 1, "choos": [1, 60], "chose": 1, "chronolog": 1, "chunk": [34, 59, 63], "chunk_num": 63, "circlelyt": 13, "citat": [21, 24], "cite": 50, "clarif": [16, 32, 64], "class": [1, 2, 31, 61, 62, 64, 65, 66], "classif": [21, 64], "classifi": [16, 21, 50, 56, 57], "classify_ntri": 16, "classify_text_dalechal": 21, "clean": [2, 17, 19, 67], "clean_text": 19, "clear": 1, "close": [31, 48, 62], "closer": [45, 46, 59], "clue": 62, "cmu": 12, "code": [6, 18, 29, 32, 51, 55, 61, 62, 68], "coeffici": [1, 4, 39, 62, 65, 68], "coerce_to_date_or_numb": 23, "cognit": 62, "col": 2, "colab": [0, 1], "collabor": [59, 62], "collaps": 2, "collect": [1, 2, 34, 49, 50, 52, 61, 62], "colleg": 33, "column": [0, 2, 4, 6, 7, 8, 9, 12, 13, 14, 16, 18, 23, 25, 28, 51, 56, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73], "column_count_frequ": 28, "column_count_mim": 28, "column_mimc": 28, "column_nam": 71, "column_to_summar": 72, "com": [1, 2, 4, 5, 13, 15, 18, 64, 68, 71], "comb": 62, "combin": [0, 1, 6, 28, 64, 71], "come": [1, 12, 13, 21, 32, 33, 58, 61], "comm": [1, 61], "command": [1, 61], "comment": 48, "commit": 23, "commit_data": 19, "common": [0, 32, 62, 64], "commonli": 37, "commun": [0, 1, 11, 44, 48, 55, 60, 62, 64], "companion": 1, "compar": [31, 35, 44, 45, 52, 64, 71, 73], "compat": [0, 1, 61], "complement": [31, 36], "complet": [1, 2, 31, 55], "complex": [0, 35, 43, 50, 62], "compon": 50, "comprehens": [33, 48], "compress": 71, "comput": [0, 2, 4, 5, 6, 10, 11, 12, 13, 14, 28, 29, 30, 31, 34, 35, 36, 37, 40, 41, 42, 45, 46, 49, 50, 52, 55, 62, 64, 65, 66, 69, 73], "compute_frequ": 28, "compute_frequency_per_conv": 28, "compute_vectors_from_preprocess": [0, 2], "computetf": 28, "conain": 61, "concat_bert_featur": [1, 61, 64], "concaten": [19, 49, 64, 71], "concentr": 55, "concept": [29, 39, 42, 62], "conceptu": [61, 62], "concis": 43, "concret": 29, "conduct": 1, "confid": [2, 5, 15, 30, 47, 64], "conflict": 62, "confound": 44, "congruent": 34, "conjection_seper": 19, "conjunct": [19, 31, 36, 49], "conjunction_start": 49, "connect": 39, "conscious": 35, "consecut": 22, "consequ": [0, 1], "consid": [1, 2, 33, 37], "consider": [61, 62], "consist": [31, 36, 40, 41], "constitut": 41, "constrain": [34, 35], "construct": [1, 11, 55, 62], "constructor": 47, "consult": 5, "contain": [1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 23, 25, 28, 29, 30, 35, 38, 42, 47, 49, 50, 55, 61, 62, 63, 64, 67, 71, 72, 73], "content": [0, 1, 12, 13, 28, 34, 36, 39, 41, 42, 45, 46, 62, 64, 67], 
"content_mimicry_scor": 28, "content_mimicry_score_per_conv": 28, "content_word_accommod": 31, "content_word_accommodation_per_conv": 31, "content_word_mimicri": 28, "context": [2, 32, 42, 48, 62, 71], "continu": [56, 57], "contract": 49, "contrast": 39, "contribut": [13, 34, 37, 62], "control": 1, "conv": [1, 61], "conv_data": [2, 65], "conv_features_al": [1, 61], "conv_features_bas": [1, 11, 61], "conv_level_featur": 2, "conv_to_float_arr": 8, "convei": [6, 34, 52], "conveni": [1, 61], "convers": [0, 2, 3, 4, 6, 7, 8, 9, 12, 13, 23, 25, 28, 29, 31, 34, 35, 36, 37, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 52, 55, 58, 59, 61, 63, 64, 65, 66, 68, 71, 72, 73], "conversation_id": [2, 28, 61, 71], "conversation_id_col": [0, 1, 2, 4, 6, 7, 8, 9, 13, 23, 25, 26, 27, 61, 63, 64, 65, 66, 68, 72, 73], "conversation_num": [0, 1, 2, 6, 7, 66, 71, 73], "conversationlevelfeaturescalcul": [2, 65, 69], "convert": [8, 41, 49, 71], "convict": 5, "convo_aggreg": [0, 1, 2, 65], "convo_column": [0, 1, 2, 65], "convo_method": [0, 1, 2, 65], "convokit": [17, 50, 62, 64], "coordin": 55, "copi": [0, 1], "copular": [31, 36], "core": [34, 64, 69], "cornel": 17, "corpu": [0, 1, 50], "corrado": 37, "correl": [41, 55], "correspond": [30, 34, 35, 40, 49, 55, 66], "cosin": [6, 7, 13, 28, 31, 34, 35, 36, 40, 45, 46, 65], "could": [1, 31, 33, 36, 50, 54], "could_you": 49, "couldn": [31, 36], "count": [1, 3, 12, 14, 15, 16, 19, 21, 25, 28, 30, 31, 32, 36, 39, 41, 43, 44, 49, 52, 53, 54, 56, 58, 64, 65, 66], "count_all_cap": 22, "count_bullet_point": 22, "count_charact": 3, "count_difficult_word": 21, "count_ellips": 22, "count_emoji": 22, "count_emphasi": 22, "count_line_break": 22, "count_link": 22, "count_match": [19, 49], "count_messag": 3, "count_numb": 22, "count_parenthes": 22, "count_quot": 22, "count_responding_to_someon": 22, "count_spacy_match": 19, "count_syl": 21, "count_turn": 25, "count_turn_taking_index": 25, "count_user_refer": 22, "count_word": 3, "countabl": [1, 65], "countd": 36, "counterfactu": 50, "cours": [16, 31, 34, 36, 63], "cover": 28, "creat": [0, 1, 2, 13, 19, 31, 40, 42, 61, 62, 64, 65, 66, 71], "create_chunk": 63, "create_chunks_messag": 63, "create_cumulative_row": 71, "credit": 33, "crowd": 13, "csv": [1, 2, 61, 62, 67], "cumul": [2, 71], "cumulative_group": [0, 1, 2, 71], "current": [1, 11, 23, 31, 34, 35, 36, 40, 45, 46, 58, 61, 64, 71], "curt": 43, "custom": [0, 11, 62], "custom_featur": [0, 1, 2, 61], "customiz": 62, "cut": 1, "cutoff": [2, 15, 47, 64], "d": [0, 1, 2, 31, 34, 36, 61], "dale": [1, 21, 39, 64, 70], "dale_chall_help": 21, "danescu": [49, 50], "dash": 22, "data": [0, 2, 6, 7, 8, 9, 13, 19, 20, 32, 37, 40, 41, 47, 51, 55, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73], "datafram": [0, 1, 2, 4, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 23, 25, 28, 37, 47, 49, 59, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73], "dataknowsal": 15, "dataset": [1, 2, 9, 12, 13, 28, 31, 41, 47, 52, 61, 64, 65, 66, 73], "date": [1, 61], "datetim": [23, 58], "dcosta": 62, "deal": [50, 59], "death": 1, "debat": 59, "decid": 62, "decis": [1, 13, 62], "declar": [1, 62, 69], "deepli": 62, "default": [0, 1, 2, 5, 11, 13, 16, 23, 30, 34, 35, 42, 47, 62, 63, 66, 67, 71, 73], "defer": [17, 50], "defin": [0, 11, 21, 31, 34, 36, 40, 59, 62, 64, 65, 66, 70], "definit": [1, 3, 44], "degre": [6, 30, 36, 45, 46, 55], "delet": 29, "deliber": 1, "demo": 61, "democrat": 1, "demystifi": 62, "denomin": 59, "densiti": 60, "dep_": 49, "dep_pair": 19, "depend": [0, 1, 10, 19, 32, 49, 52, 61, 63], "deriv": [2, 11, 65, 66], 
"describ": [1, 11, 62], "descript": [1, 61], "design": [0, 1, 2, 13, 34, 62], "desir": [2, 63, 72], "detail": [0, 11, 33, 41, 43, 61, 62], "detect": [1, 32, 37, 38, 47, 48, 49, 54], "determin": [13, 18, 31, 35, 36, 40, 45, 46, 71], "dev": 24, "develop": [5, 37, 40, 62], "deviat": [4, 5, 29, 40, 41, 55, 58, 65, 72, 73], "df": [4, 8, 9, 12, 13, 16, 18, 23, 28, 63, 71], "dict": [17, 19, 28, 67], "dictionari": [1, 15, 17, 19, 28, 30, 42, 49, 61, 67], "did": [1, 31, 36, 37, 47, 50, 54, 62], "didn": [31, 36], "differ": [0, 1, 2, 4, 11, 12, 23, 28, 29, 31, 34, 36, 37, 39, 40, 44, 45, 46, 47, 49, 55, 62, 63, 64, 65, 66, 71], "differenti": [49, 59], "difficult": [21, 33], "difficult_word": 21, "difficulti": 33, "dimens": [40, 62], "dimension": [34, 35], "dinner": 41, "direct": [34, 43, 45, 47, 50, 69], "direct_quest": [32, 50, 54], "direct_start": 50, "directli": [1, 62, 69], "directori": [0, 2, 19, 61, 65, 67], "disabl": 1, "disagr": 49, "disagre": 51, "discours": [31, 36], "discret": [31, 36, 45, 46], "discurs": [0, 1, 6, 8, 39, 40, 61, 65, 66], "discursive_divers": 11, "discus": 8, "discuss": [0, 1, 31, 34, 39, 40, 42, 43, 61, 62, 71], "dispers": 68, "displai": [1, 34, 42, 46, 61], "dispos": 1, "distanc": [34, 35, 40], "distinct": [31, 36, 59], "distinguish": 59, "distribut": 31, "div": 16, "diverg": [6, 34, 35], "divers": [0, 1, 6, 8, 13, 39, 61, 65], "divid": [16, 34, 59, 63], "dl": [21, 24], "do": [0, 1, 29, 31, 34, 36, 37, 43, 49, 50, 54, 62, 69], "doc": [2, 19], "doc_top": 13, "document": [1, 17, 61, 69], "doe": [1, 2, 29, 40, 42, 43, 45, 47, 54, 61, 71], "doesn": [0, 1, 29, 31, 36, 45, 61], "doi": [5, 6, 21, 24, 64], "domain": [31, 50], "don": [31, 36, 49, 54, 62, 67], "done": [2, 50], "dot": 22, "doubl": 30, "down": [31, 36], "download": [1, 61], "download_resourc": [1, 61], "downstream": [17, 62], "dozen": 62, "drive": [62, 69], "driver": [2, 61, 64, 65, 66], "drop": [0, 2, 64], "due": [34, 59], "duncan": 62, "duplic": [1, 2, 71], "durat": [58, 63], "dure": [2, 55, 59, 62], "dynam": [59, 61], "e": [0, 1, 2, 4, 15, 20, 29, 30, 31, 32, 34, 35, 36, 37, 38, 41, 42, 47, 48, 49, 52, 54, 56, 59, 61, 63, 65, 66, 71], "e2": [21, 70], "each": [0, 1, 2, 3, 4, 7, 8, 9, 11, 12, 15, 17, 19, 23, 25, 28, 30, 31, 34, 35, 36, 37, 40, 41, 42, 44, 45, 46, 47, 49, 50, 51, 52, 55, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73], "earlier": [0, 1, 2, 42], "easi": [1, 21, 62, 70], "easier": 21, "easili": 33, "easy_word": 21, "eat": 34, "echo": 31, "econom": 37, "edg": [29, 59], "edu": [1, 12, 16, 17, 70], "effect": [1, 41], "effici": 1, "effort": 55, "either": [20, 23, 52, 55], "elaps": [23, 58], "element": [1, 6], "ellips": [22, 48, 64], "els": [1, 22, 47, 64], "embed": [8, 31, 34, 35, 36, 45, 46, 65, 66, 67, 69], "emili": [30, 35, 45, 46, 47, 59, 62], "emoji": [22, 48, 64], "emot": [1, 61], "emoticon": 48, "emphas": [22, 48, 64], "emphasi": 48, "empirica": [1, 2, 71], "emploi": 45, "empti": [0, 2, 13, 67], "en": [1, 21, 24, 61, 70], "en_core_web_sm": [1, 61], "enabl": 71, "enclos": 22, "encod": [1, 8], "encompass": 62, "encount": [1, 34, 35, 61], "encourag": 64, "end": [0, 1, 15, 20, 23, 34, 54, 62, 63], "engag": 43, "engin": 2, "english": [34, 42], "enjoi": 62, "ensur": [0, 1, 40, 49, 61, 63, 67, 71], "entir": [0, 1, 12, 28, 31, 36, 40, 41, 52, 59, 62, 73], "entiti": [0, 2, 15, 39, 64], "entityrecogn": 47, "entri": [1, 28, 61], "ep8dauru1ogvjurwdbof5h6ayfbslvughjyiv31d_as6ppbt": 5, "equal": [1, 21, 23, 34, 37, 40, 55, 59, 61, 62, 63], "equival": [0, 1, 41, 55, 61], "eric": 62, "error": [1, 16, 61], 
"especi": [41, 62], "essenti": [51, 71], "establish": 31, "estim": 31, "et": [1, 5, 16, 18, 21, 24, 30, 31, 32, 33, 34, 35, 36, 38, 42, 43, 44, 49, 50, 52, 53, 54, 56, 57, 58, 59, 60, 64], "etc": [10, 15, 16, 17, 42], "evalu": [5, 47, 50], "evan": 62, "even": [0, 1, 2, 34, 37, 42, 62, 63, 67], "evenli": [34, 55], "event": [1, 34, 55, 61], "ever": 62, "everi": [1, 4, 13, 31, 34, 35, 36, 61, 62], "everybodi": [31, 36], "everyon": [31, 36, 47, 62], "everyth": [31, 36, 56], "everywher": [31, 36], "evolut": 35, "evolv": [35, 71], "exactli": [1, 2, 71], "examin": [40, 62, 63], "exampl": [0, 10, 11, 15, 21, 24, 29, 31, 32, 34, 37, 42, 43, 48, 50, 51, 54, 56, 59, 60, 61, 62], "example_data": 1, "exce": 15, "exchang": [12, 35, 39, 40, 45, 55, 64], "exclud": [0, 41, 42], "exclus": [41, 42], "excus": 32, "exhibit": 35, "exist": [0, 1, 2, 55, 61, 62, 63, 64, 67], "expand": 49, "expect": [1, 37, 47], "expected_valu": 47, "explain": [0, 29], "explan": [29, 43], "explor": [61, 62], "express": [5, 14, 30, 31, 32, 36, 38, 42, 64], "extend": 1, "extens": [43, 44], "extent": [1, 4, 7, 12, 31, 34, 35, 37, 51, 55, 59, 61], "extern": 48, "extra": 51, "extract": [1, 17, 19, 28, 40, 50, 64], "extrem": [55, 56, 57], "face": [1, 51, 61], "facilit": [62, 71], "fact": [4, 35, 50, 54, 59], "factual": [17, 24, 50], "fail": [1, 61], "fals": [0, 1, 2, 31, 54, 61, 71], "famili": 42, "far": [34, 35, 46, 50, 62], "faster": 14, "feat_count": 19, "featur": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 63, 64, 65, 66, 67], "feature_build": [0, 1, 61], "feature_dict": [1, 61], "feature_method": [64, 65], "feature_nam": [1, 61], "featurebuild": [0, 2, 11, 47, 69], "features_conceptu": [1, 61], "feauturebuild": 1, "few": [48, 62], "fewer": [12, 60], "fflow": 11, "field": [13, 17], "file": [0, 2, 12, 14, 19, 61, 65, 67], "filenam": [1, 2, 19], "filenotfounderror": 67, "fill": 71, "filler": [37, 60], "filler_paus": 49, "filter": [19, 62], "final": [1, 2, 34, 42, 62], "find": [1, 19, 28, 50], "fingertip": 62, "finit": 55, "first": [0, 2, 11, 12, 16, 19, 31, 34, 35, 36, 39, 40, 41, 42, 45, 46, 49, 52, 54, 59, 61, 62, 64, 70, 71], "first_person": 12, "first_person_plur": 49, "first_person_raw": [12, 16], "first_person_singl": 49, "five": 37, "fix": 52, "flag": 71, "float": [0, 2, 4, 5, 6, 8, 10, 13, 14, 16, 21, 24, 25, 28, 68], "floor": 59, "flow": [0, 1, 7, 31, 36, 39, 41, 45, 46, 61, 64], "focal": [31, 36], "focu": 41, "folder": [0, 1, 19], "follow": [0, 1, 2, 11, 16, 17, 29, 31, 32, 33, 41, 42, 47, 49, 50, 53, 55, 59, 60, 61, 64, 65], "for_m": 49, "for_you": 49, "forc": [0, 1, 61], "form": 1, "formal": [1, 61], "formal_titl": 49, "format": [0, 1, 8, 17, 22, 47, 48, 61, 62, 64], "former": [45, 46], "formula": [33, 42, 59, 64, 70], "fornt": 1, "forward": [0, 1, 7, 39, 41, 61, 64], "forward_flow": 35, "found": [1, 2, 5, 28, 30, 33, 61, 69], "four": [1, 8], "fourth": 33, "frac": 55, "fraction": 59, "frame": 64, "framework": [49, 50, 62], "frequenc": [28, 31, 44, 64], "frequency_dict": 28, "fridai": 34, "from": [0, 1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 19, 21, 28, 29, 31, 32, 33, 34, 35, 36, 39, 41, 42, 49, 50, 51, 53, 55, 56, 57, 58, 61, 62, 64, 65, 66, 67, 71], "full": [1, 28, 37], "full_empirical_dataset": 1, "fulli": [32, 48], "functinon": 12, "function": [1, 2, 3, 4, 10, 11, 12, 13, 14, 16, 20, 21, 23, 28, 31, 39, 44, 45, 46, 50, 56, 57, 60, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73], "function_mimic_word": 28, "function_mimicry_scor": 28, "function_word_mimicri": 
28, "function_word_refer": 28, "fund": 62, "further": [1, 61, 71], "futur": [23, 66], "g": [0, 1, 2, 4, 15, 20, 29, 31, 32, 36, 37, 38, 41, 42, 47, 48, 52, 54, 59, 61, 63, 65, 66, 71], "game": [1, 2, 59, 71], "gaug": [33, 52], "gener": [0, 2, 9, 11, 12, 16, 21, 31, 34, 35, 36, 40, 42, 45, 46, 49, 51, 59, 65, 66, 67, 69, 71, 72], "generaliz": 23, "generate_bert": 67, "generate_certainty_pkl": 67, "generate_lexicon_pkl": 67, "generate_vect": 67, "gensim": 40, "get": [16, 20, 21, 28, 30, 31, 36, 49, 66, 67], "get_all_dd_featur": 11, "get_centroid": 66, "get_certainti": 5, "get_certainty_scor": 64, "get_content_words_in_messag": 28, "get_conversation_level_aggreg": 65, "get_cosine_similar": 6, "get_dale_chall_easy_word": [21, 70], "get_dale_chall_score_and_classf": 64, "get_dd": 6, "get_dd_featur": 8, "get_dep_pair": [19, 49], "get_dep_pairs_noneg": [19, 49], "get_discursive_diversity_featur": 65, "get_first_pct_of_chat": 2, "get_first_person_word": [12, 70], "get_forward_flow": [7, 64], "get_function_word": 70, "get_function_words_in_messag": 28, "get_gini": 68, "get_gini_featur": 65, "get_info_divers": 13, "get_info_exchange_wordcount": 12, "get_liwc_count": 14, "get_max": 72, "get_mean": 72, "get_median": 72, "get_mimicry_bert": 28, "get_min": 72, "get_moving_mimicri": 28, "get_named_ent": 64, "get_nan_vector": [27, 67], "get_polarity_scor": 24, "get_politeness_strategi": 17, "get_politeness_v2": 18, "get_proportion_first_pronoun": 16, "get_question_word": 70, "get_reddit_featur": 64, "get_senti": 67, "get_stdev": 72, "get_subjectivity_scor": 24, "get_sum": 72, "get_team_bursti": 4, "get_temporal_featur": [4, 64], "get_time_diff": 23, "get_time_diff_startend": 23, "get_turn": 25, "get_turn_id": 71, "get_turn_taking_featur": 65, "get_unique_pairwise_combo": 6, "get_user_level_aggreg": 65, "get_user_level_summary_statistics_featur": 66, "get_user_level_summed_featur": 66, "get_user_max_datafram": 72, "get_user_mean_datafram": 72, "get_user_median_datafram": 72, "get_user_min_datafram": 72, "get_user_network": [11, 66], "get_user_stdev_datafram": 72, "get_user_sum_datafram": 72, "get_variance_in_dd": 26, "get_within_person_disc_rang": 27, "get_word_ttr": 16, "get_zscore_across_all_chat": 73, "get_zscore_across_all_convers": 73, "gina": 62, "gini": [1, 39, 62, 65, 68], "gini_coeffici": [11, 69], "github": [0, 1, 2, 18, 71], "give": [0, 1, 29, 37], "give_ag": 49, "given": [0, 1, 5, 6, 13, 14, 28, 30, 31, 33, 34, 35, 36, 40, 41, 55, 59, 66, 67, 71], "go": [1, 34, 35, 45, 46, 50, 62], "goal": 62, "good": [50, 56, 62], "goodby": 49, "googl": [0, 1], "got": [31, 36], "gotta": [31, 36], "grade": 33, "grader": 21, "grai": 35, "grammat": 36, "granularli": 35, "grate": 62, "gratitud": [17, 49, 50], "great": [47, 50, 51, 56, 59, 60, 62], "greater": 55, "greet": 50, "groceri": 41, "group": [0, 2, 4, 13, 29, 33, 34, 41, 52, 59, 62, 68, 71, 72], "grouping_kei": [0, 1, 2, 71], "gt": 22, "guess": 10, "gun": 1, "gy": 15, "gym": 34, "ha": [0, 1, 32, 34, 35, 37, 42, 43, 46, 52, 54, 55, 56, 59, 61, 62, 63, 71], "had": [1, 31, 36, 54, 61], "hadn": [31, 36], "handl": [19, 29, 71], "happen": [1, 2, 55, 62, 63], "happi": 42, "harder": 21, "hashedg": [17, 50], "hasn": [31, 36], "hasneg": 50, "hasposit": 50, "hate": 31, "have": [0, 1, 10, 12, 16, 31, 34, 36, 37, 40, 41, 42, 45, 46, 50, 54, 59, 60, 61, 62, 71], "haven": [31, 36], "he": [1, 31, 36], "header": 18, "hear": 32, "heart": [61, 62], "heat": 1, "heavi": 62, "hedg": [11, 30, 39, 49, 50, 64], "hei": [1, 35, 45, 46, 50], "helena": [47, 62], "hello": [0, 43, 49], 
"help": [0, 31, 34, 36, 43, 45, 46, 52, 58, 69], "helper": [23, 67], "her": [30, 31, 36], "here": [1, 29, 31, 34, 41, 42, 47, 61, 62, 66], "herself": [31, 36], "hesit": [60, 64], "hi": [31, 35, 36, 43, 45, 46], "hierach": 71, "hierarch": 71, "high": [0, 1, 2, 61, 62, 71], "higher": [0, 1, 21, 31, 34, 36, 40, 41, 42, 44, 45, 46, 55, 60], "highest": 71, "highlight": 1, "him": [31, 36], "himself": [31, 36], "hmm": [31, 36], "hoc": 62, "hold": 31, "hole": 62, "home": 42, "homework": 34, "homonym": 31, "hood": 1, "hope": 35, "host": [45, 46], "hour": 48, "how": [1, 5, 28, 29, 30, 31, 34, 35, 36, 39, 43, 45, 51, 52, 54, 56, 62], "howev": [0, 1, 3, 11, 35, 40, 42, 44, 54, 56, 61, 62], "howitwork": 1, "html": [1, 2, 15, 17, 24, 61], "http": [1, 2, 4, 5, 6, 12, 13, 15, 16, 17, 18, 21, 24, 41, 45, 46, 47, 61, 64, 68, 70, 71], "hu": [1, 42, 62], "hug": [1, 51, 61], "huggingfac": 1, "huh": [31, 32, 36], "human": [37, 50, 62], "hyperlink": 48, "hyphen": [1, 61], "hypothet": 42, "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 19, 20, 21, 22, 23, 24, 25, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 71, 73], "iby1": 5, "id": [2, 4, 7, 23, 28, 62, 66, 68, 71, 72, 73], "idea": [12, 35, 40, 47, 51], "ident": [34, 35], "identif": 1, "identifi": [0, 1, 2, 4, 8, 9, 15, 23, 25, 30, 31, 41, 47, 50, 52, 61, 63, 64, 71, 72], "identiif": [13, 71], "ignor": [1, 32], "illustr": [1, 41, 48, 62], "imagin": 1, "immedi": [31, 35, 64], "impact": [1, 60], "impersonal_pronoun": 49, "implement": 64, "impli": 37, "import": [31, 32, 36, 44, 45, 62, 69], "incent": 13, "includ": [0, 1, 2, 10, 17, 22, 31, 32, 35, 36, 42, 45, 46, 51, 52, 56, 61, 62, 66, 71], "inclus": [13, 71], "incongru": [8, 34], "incorpor": [1, 42, 45, 46], "increas": [1, 42, 62], "increment": 71, "independ": 1, "index": [1, 2, 4, 13, 25, 37, 39, 55, 61, 65], "indic": [1, 2, 16, 21, 22, 30, 32, 34, 35, 36, 40, 41, 43, 44, 48, 49, 50, 52, 55, 60, 63, 71], "indirect": 50, "indirect_btw": 50, "indirect_greet": 50, "indirectli": 69, "individu": [0, 1, 5, 11, 31, 34, 37, 45, 50, 59, 60, 62, 72], "inequ": 37, "infer": [1, 51, 67], "influenc": 1, "info": [13, 18, 64], "info_divers": 13, "info_exchang": 64, "info_exchange_wordcount": [41, 64], "info_exchange_zscor": 11, "inform": [6, 11, 12, 13, 24, 32, 34, 39, 48, 62, 64, 65], "informal_titl": 49, "information_divers": 11, "initi": [2, 62, 63, 64, 65, 66], "input": [0, 2, 4, 6, 12, 13, 14, 15, 16, 19, 20, 22, 28, 50, 55, 60, 62, 63, 64, 65, 66, 67, 71, 72], "input_data": [25, 68, 72], "input_df": [1, 2, 61, 71], "inquiri": [30, 39, 52], "insid": 1, "insight": 1, "inspir": 15, "instal": [1, 61, 62], "instanc": [1, 22, 50, 59, 66], "instanti": 2, "insteac": 1, "instead": [1, 62], "instruct": [1, 61], "int": [2, 3, 10, 13, 15, 16, 19, 20, 22, 28, 63, 64, 67], "intact": 71, "integ": [0, 13, 40, 47], "intend": 59, "interact": [1, 11, 43, 44, 62, 69], "interconnect": 62, "interest": [1, 61, 62], "interfac": 62, "intermedi": [59, 64], "intern": 29, "interpret": [0, 1, 23], "interrupt": 59, "interv": [58, 65], "introduc": 62, "introduct": [11, 61], "invalid": 67, "invers": 64, "involv": [41, 62, 65], "io": [1, 24, 47, 61], "ipynb": [0, 1], "is_hedged_sentence_1": 10, "isn": [1, 31, 36], "issu": [1, 31, 36, 37, 42, 61], "ital": 64, "italic": 22, "item": [0, 71], "its": [0, 15, 31, 35, 36, 40, 41, 47, 54, 55, 64, 69], "itself": [23, 31, 36, 44], "john": 1, "jonson": 62, "journal": [5, 64], "json": [1, 61], 
"jurafski": 70, "juri": 1, "juries_df": 1, "jury_conversations_with_outcome_var": 1, "jury_feature_build": 1, "jury_output": 1, "jury_output_chat_level": [1, 61], "jury_output_turn_level": 1, "just": [1, 2, 31, 36, 46, 50, 59, 61, 62], "katharina": 34, "keep": [1, 71], "kei": [1, 2, 4, 19, 28, 30, 54, 61, 71], "keyerror": 71, "keyword": [19, 49], "kind": [10, 62], "kitchen": 42, "knob": 0, "know": [1, 30], "knowledg": 29, "known": [1, 32, 61], "kumar": 62, "kw": 19, "lab": [1, 2, 62, 71], "label": [1, 15, 21, 51], "lack": [31, 38, 45, 46], "languag": [15, 31, 34, 42, 50, 62], "larg": [1, 31, 69], "larger": [0, 31, 61], "last": [1, 31], "late": 32, "later": [0, 1, 2, 42, 61], "latest": [1, 61], "latter": [31, 36], "lda": [13, 40], "learn": [1, 61, 62], "least": [10, 32, 42, 63, 67], "led": 62, "legal": 49, "lemmat": [13, 40], "len": 28, "length": [35, 39, 41, 42, 44], "less": [1, 13, 32, 50, 52, 55, 62, 63], "let": [41, 49, 53], "let_me_know": 49, "letter": [49, 71], "level": [0, 1, 2, 3, 4, 6, 7, 8, 9, 12, 13, 14, 16, 23, 61, 64, 65, 66, 71, 72], "lexic": [1, 10, 12, 14, 16, 31, 32, 36, 42, 60, 62, 64], "lexical_featur": [14, 64], "lexical_features_v2": [10, 11], "lexicon": [5, 10, 14, 30, 39, 50, 52, 67, 69], "lexicons_dict": 67, "librari": [34, 51, 56, 57], "lift": 62, "light": 61, "like": [1, 22, 31, 34, 36, 41, 50, 61, 62], "limiat": 32, "limit": [11, 32, 37, 42, 54], "line": [0, 1, 19, 22, 48, 61, 62, 64], "linear": 64, "linguist": [18, 19, 30, 39, 50, 52], "link": [22, 29, 48, 50, 64], "list": [1, 2, 6, 7, 10, 11, 12, 13, 15, 19, 20, 21, 22, 28, 31, 33, 36, 37, 42, 48, 49, 50, 53, 54, 61, 64, 65, 66, 67, 68, 70, 71], "literatur": 62, "littl": 38, "littlehors": 1, "liu": [42, 52], "live": [1, 54], "liwc": [14, 30, 39, 51, 52, 56, 62], "liwc_featur": [10, 14], "lix": 34, "ll": [1, 31, 36, 61], "load": [19, 69], "load_saved_data": 19, "load_to_dict": 19, "load_to_list": 19, "loc": 15, "local": [1, 51, 61], "locat": [1, 62], "long": [4, 42], "longer": [30, 41, 43, 48, 61, 62], "look": [2, 34, 61, 65, 66], "loos": 36, "lot": [31, 36], "loud": 60, "love": [31, 56], "low": [1, 2, 29, 55, 60, 71], "lower": [0, 1, 21, 31, 33, 36, 41, 42, 44, 55, 60], "lowercas": [2, 13, 40, 48, 49, 71], "lowest": 71, "lpearl": 16, "lst": 6, "m": [0, 2, 23, 30, 31, 36], "made": [1, 23, 35, 59, 61, 62], "magnitud": 55, "mai": [1, 2, 11, 28, 31, 32, 35, 36, 37, 41, 42, 43, 44, 54, 61, 62, 71], "main": [1, 2, 5, 62, 64, 65, 66], "make": [1, 5, 31, 34, 55, 56, 62, 66, 69, 71], "man": 62, "mani": [1, 4, 11, 32, 37, 41, 60, 62, 66], "manner": [55, 62], "manual": [1, 61], "map": [13, 34], "mark": [19, 20, 22, 43, 54, 64, 71], "marker": [18, 32, 39, 42, 50, 51, 52, 54, 56], "marlow": 44, "matarazzo": 62, "match": [1, 5, 16, 19, 30], "math": 34, "matter": [28, 47], "max": [0, 1, 2, 11, 66, 72], "max_num_chunk": 63, "max_user_mean_num_word": 1, "maxim": [34, 35, 37, 72], "maximum": [1, 63, 65, 72], "mayb": [38, 47], "mcfarland": 70, "me": [31, 32, 36, 41, 50, 53], "mean": [0, 1, 2, 4, 6, 11, 13, 21, 29, 31, 34, 36, 40, 41, 42, 47, 55, 56, 58, 61, 62, 65, 66, 72, 73], "mean_num_word": 1, "meaning": [31, 41, 55], "meaningless": 41, "meant": 39, "measur": [0, 1, 7, 12, 13, 20, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 51, 52, 54, 55, 56, 57, 58, 59, 60, 62, 64, 68], "mechan": 32, "median": [0, 1, 72], "medium": 21, "meet": 48, "member": [13, 34, 37, 55], "merg": [2, 8, 65, 66], "merge_conv_data_with_origin": 2, "messag": [0, 1, 2, 3, 4, 5, 8, 11, 12, 13, 14, 15, 16, 17, 20, 21, 22, 23, 24, 28, 30, 31, 
34, 35, 36, 37, 39, 41, 45, 46, 47, 48, 50, 51, 52, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 71, 73], "messaga": 61, "message_col": [0, 1, 2, 12, 13, 14, 61, 64, 65, 67, 71], "message_embed": [6, 7, 8], "message_lower_with_punc": 71, "metadata": [0, 1], "method": [1, 5, 31, 41, 50, 62, 65], "metric": [0, 1, 8, 30, 34, 35, 46, 47, 48, 55, 66], "michael": 1, "mid": [1, 2, 71], "middl": [21, 34, 63], "might": [0, 1, 29, 43, 48, 53], "mikeyeoman": [18, 64], "mileston": 34, "millisecond": [0, 2], "mimic": [28, 31, 36, 45], "mimic_word": 28, "mimick": [28, 31, 64], "mimicri": [0, 1, 28, 31, 35, 36, 39, 61, 64], "mimicry_bert": [45, 46], "min": [1, 2, 11, 72], "mind": [1, 35, 50], "mine": [31, 36, 53, 59], "minim": [0, 41, 60], "minimum": [65, 72], "minmiz": 72, "minu": [12, 41, 64], "minut": [55, 58], "mirror": 1, "miss": [1, 32, 61, 71], "mitig": [31, 36], "mizil": [49, 50], "mm": [31, 36], "mnsc": 6, "modal": 50, "mode": 60, "model": [1, 13, 15, 31, 34, 35, 36, 40, 45, 46, 47, 51, 62, 67], "modif": 35, "modifi": [1, 9, 19, 32, 64], "modul": [0, 1, 11, 34, 49, 50, 61, 69], "monologu": 59, "more": [0, 1, 2, 11, 12, 22, 23, 24, 28, 31, 32, 34, 36, 37, 40, 41, 42, 43, 44, 45, 46, 50, 52, 55, 59, 61, 62, 71], "morn": 1, "most": [1, 24, 31, 55, 62, 69], "motiv": 61, "move": [0, 1, 28, 31, 36, 39, 45, 59, 61], "movi": 31, "much": [1, 28, 31, 34, 35, 36, 45, 62], "multi": [1, 2, 71], "multidimension": [45, 46], "multipl": [0, 1, 2, 19, 62, 71], "must": [1, 6, 62, 71], "my": [30, 31, 35, 36, 45, 46, 50, 53], "my_chat_featur": 1, "my_feature_build": 61, "my_fil": 1, "my_output": 61, "my_output_chat_level": 61, "my_output_conv_level": 61, "my_output_user_level": 61, "my_pandas_datafram": 61, "myself": [31, 36, 53], "n": [0, 2, 35, 45, 46, 47, 57, 59, 60], "n_chat": 59, "na": [5, 33, 43, 44, 48, 49, 50, 53, 58], "naiv": [2, 20, 32, 34, 38, 39, 53, 56, 57, 64], "name": [0, 2, 4, 7, 8, 9, 12, 13, 14, 15, 17, 19, 23, 25, 28, 30, 32, 35, 39, 45, 46, 50, 51, 56, 63, 64, 66, 67, 68, 71, 72, 73], "name_to_train": 47, "named_ent": [15, 47], "named_entity_recognition_featur": 11, "nan": [0, 34, 67], "nate": [35, 45, 46], "nathaniel": [35, 45, 46], "nativ": 50, "natur": [43, 55], "ndarrai": 68, "nearest": [13, 40], "nearli": 62, "necessari": [63, 67], "need": [0, 1, 2, 21, 62, 66, 67], "need_sent": 67, "need_senti": 67, "neg": [1, 24, 29, 31, 34, 35, 36, 42, 50, 51, 52, 54, 56, 61, 62, 67], "negat": [19, 49], "negative_bert": [0, 1, 51, 61], "negative_emot": [49, 51, 52, 56], "negoti": 62, "neighborhood": 54, "neither": 30, "ner": 15, "ner_cutoff": [0, 1, 2, 47, 64], "ner_train": 64, "ner_training_df": [0, 1, 2, 47, 64], "nest": [0, 1, 2, 22, 71], "net": [45, 46], "network": 11, "neutral": [1, 5, 24, 30, 51, 55, 61, 67], "neutral_bert": [1, 51, 61], "never": 1, "new": [1, 4, 13, 34, 61, 64, 65, 66, 72], "new_column_nam": 72, "next": [1, 32, 47, 58], "nice": [1, 50, 54, 61], "nicknam": 1, "niculescu": [49, 50], "night": 31, "nikhil": [59, 62], "nltk": [1, 42, 61], "nobodi": [31, 36], "nois": 32, "non": [1, 2, 28, 31, 37, 48, 61, 62, 71], "none": [1, 2, 19, 23, 37, 55, 61, 64, 65, 66, 67], "nor": 30, "normal": [19, 28, 31], "notabl": 62, "note": [0, 2, 12, 16, 20, 42, 61, 71], "notebook": [0, 1], "noth": [31, 36, 56], "noun": 1, "novel": [45, 46], "now": [0, 1], "nowher": [31, 36], "np": [67, 68], "ntri": 32, "null": 34, "num": 48, "num_char": 65, "num_chunk": [27, 63], "num_hedge_word": 10, "num_messag": 65, "num_named_ent": [15, 47], "num_row": 63, "num_top": 13, "num_word": [12, 16, 65], "number": [0, 1, 3, 11, 
12, 13, 14, 15, 16, 19, 20, 21, 22, 23, 25, 28, 31, 32, 34, 36, 37, 40, 41, 42, 43, 44, 47, 48, 49, 54, 56, 58, 59, 60, 62, 63, 64, 66, 69, 71, 72], "numer": [0, 1, 2, 11, 13, 33, 68, 72, 73], "numpi": [1, 61, 68], "o": 35, "object": [1, 2, 11, 19, 44, 50, 57, 58, 61, 62, 64, 65, 66], "obtain": [1, 13, 17, 23, 24, 34, 61], "occur": [0, 4, 31, 42, 71], "occurr": 19, "off": [0, 1, 31, 36], "offer": 0, "offici": 61, "often": [28, 36, 47, 48, 62], "oh": [31, 36, 48], "okai": [31, 36], "older": [1, 49, 61], "on_column": [18, 23, 28, 68, 72, 73], "onc": [1, 2, 11, 58, 61, 62], "one": [0, 1, 2, 4, 10, 12, 19, 23, 25, 28, 29, 31, 32, 36, 37, 47, 51, 56, 59, 61, 62, 67, 68, 71, 73], "ones": [31, 36], "onli": [0, 1, 2, 5, 11, 23, 29, 31, 32, 34, 36, 37, 45, 53, 58, 59, 61, 62, 71], "onlin": [1, 32, 39, 64], "onward": 0, "open": [0, 62, 66], "operation": [39, 50, 59], "opinion": [24, 31], "oppos": [2, 31, 34, 35, 55], "opposit": 34, "option": [1, 2, 37, 62, 63, 67, 71], "order": [0, 1, 35, 37, 42, 71], "org": [2, 6, 15, 21, 24, 41, 70], "organ": 1, "origin": [1, 2, 5, 12, 21, 31, 32, 35, 36, 37, 45, 46, 49, 50, 59], "orthogon": 34, "other": [1, 2, 9, 11, 28, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 45, 46, 48, 51, 52, 54, 56, 58, 59, 61, 62, 64, 66, 71], "other_lexical_featur": [11, 64], "otherwis": [2, 10, 21, 23, 32, 38, 63, 67], "our": [0, 1, 2, 11, 13, 29, 31, 32, 36, 37, 39, 53, 59, 61, 71], "ourselv": 53, "out": [1, 16, 19, 31, 36, 55, 60, 62], "outcom": [1, 44, 62], "output": [0, 2, 10, 17, 19, 40, 61, 62, 64, 67], "output_file_bas": [0, 1, 2, 61], "output_file_path_chat_level": [1, 2], "output_file_path_conv_level": [1, 2], "output_file_path_user_level": [1, 2], "output_path": 67, "outsid": [1, 2, 12], "over": [1, 16, 29, 31, 34, 35, 36, 37, 53, 55, 60, 62, 71], "overal": [30, 31, 34, 36, 45, 46], "overrid": [0, 1, 2], "overview": [0, 61, 62], "overwhelmingli": 1, "overwritten": 1, "own": [0, 1, 9, 35, 62], "p": 55, "pacakg": 24, "pace": [43, 62], "packag": [17, 18, 40, 62], "pad": 19, "page": [1, 11, 29, 39, 61, 62, 69], "pair": [6, 19, 34, 49, 71], "pairwis": [6, 34], "panda": [0, 1, 2, 12, 14, 16, 23, 47, 64, 65, 66, 71, 72, 73], "paper": [4, 5, 12, 18, 29, 40, 49, 50, 64], "paragraph": 22, "param": 71, "paramet": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 47, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73], "paramt": 1, "pardon": 32, "parenthes": [22, 48, 64], "parenthet": [22, 48], "pars": [16, 50, 60], "part": [1, 10, 13, 29, 36, 42, 52, 71], "particip": [1, 9, 37, 62], "particl": [31, 36], "particular": [1, 11, 31, 32, 34, 41, 45, 47, 51, 59, 62], "particularli": 42, "partner": 32, "pass": [1, 13, 21, 47, 71], "path": [1, 2, 19, 61, 67], "path_in": 19, "pattern": [4, 11, 19, 55, 62, 67], "paus": 4, "pd": [1, 2, 4, 6, 7, 8, 9, 12, 13, 14, 15, 16, 18, 19, 23, 25, 63, 64, 65, 66, 67, 68, 71], "pdf": [5, 12, 13, 16, 18, 21, 24, 64, 70], "penalti": 1, "pennebak": [12, 37, 41, 42, 52], "pennyslvania": 62, "peopl": [1, 32, 59, 62], "per": [1, 6, 9, 19, 42, 63, 66, 72], "percentag": [2, 21], "perfect": [37, 59], "perform": [0, 1, 16, 50], "perhap": 1, "period": [4, 34, 55], "person": [1, 8, 12, 15, 16, 32, 34, 39, 41, 42, 50, 59, 62, 64, 70], "perspect": 1, "petrocelli": 5, "phrase": [19, 30, 38, 54], "phrase_split": 19, "pickl": [19, 67], "piec": [36, 42, 59, 63], "pl": 50, "place": [55, 61, 62], "plan": [34, 35, 45, 46], "player": 59, "pleas": [0, 1, 38, 49, 50, 61, 62], "please_start": 50, "point": [22, 24, 34, 35, 45, 46, 48, 52, 64, 66], "poisson": 
55, "polar": [24, 39, 51, 52, 64], "polit": [1, 17, 18, 30, 32, 38, 39, 42, 51, 52, 54, 56, 64], "politeness_featur": 11, "politeness_v2": 11, "politeness_v2_help": 11, "politenessstrategi": [17, 50], "portion": 0, "posit": [0, 1, 11, 15, 24, 29, 31, 39, 42, 50, 51, 54, 56, 61, 62, 64, 67], "positive_affect_lexical_per_100": [51, 52, 56], "positive_bert": [0, 1, 51, 61], "positive_emot": [49, 51, 52, 56], "positivity_bert": [1, 61], "positivity_zscor": 64, "positivity_zscore_chat": 52, "positivity_zscore_convers": 52, "possess": 31, "possibl": [1, 34, 62, 66], "possibli": [38, 62], "practic": [34, 35], "pre": [1, 4, 21, 37, 49, 64], "preced": [31, 35, 71], "precend": 35, "precis": 47, "precomput": 51, "predefin": 19, "predetermin": [31, 36], "predict": [2, 47, 51, 64], "prefer": [0, 1], "preload_word_list": 69, "prep_simpl": 19, "prep_whol": 19, "preposit": [31, 36], "preproces": 48, "preprocess": [0, 1, 2, 13, 19, 40, 43, 49, 51, 61, 69], "preprocess_chat_data": 2, "preprocess_conversation_column": 71, "preprocess_naive_turn": 71, "preprocess_text": 71, "preprocess_text_lowercase_but_retain_punctu": 71, "presenc": [2, 32, 67], "present": [1, 2, 14, 30, 31, 38, 42, 55, 62, 71], "prespecifi": 19, "prevent": 51, "previou": [1, 7, 28, 31, 36, 45, 46, 58, 64, 71], "primari": 34, "print": 2, "prior": [2, 64, 71], "priya": [47, 62], "probabl": [15, 47], "problem": 62, "procedur": 62, "proceed": 46, "process": [0, 1, 2, 4, 10, 21, 37, 55, 62, 64, 65, 67, 69, 71], "prodi": 15, "produc": [1, 2, 34], "product": 15, "professor": 62, "progress": [1, 2], "project": [54, 62], "pronoun": [12, 16, 31, 36, 39, 41, 42, 64, 70], "proper": 1, "properti": [1, 11, 61], "proport": [16, 39, 42, 64], "propos": 37, "provid": [0, 1, 2, 15, 29, 30, 33, 36, 39, 44, 47, 54, 62], "proxi": 42, "pseudonym": 1, "psycholog": 42, "pub": 70, "publish": [5, 30, 64], "pubsonlin": 6, "punctuat": [0, 2, 16, 19, 20, 21, 28, 43, 54, 60, 71], "punctuation_seper": 19, "puncut": 48, "pure": [24, 36], "purpos": 1, "put": [34, 50, 62, 66], "py": [0, 1, 14, 49, 61], "pydata": 2, "pypi": [1, 61], "python": [1, 32, 41, 56, 57, 61, 62, 68], "qtd": 62, "qualiti": 41, "quantifi": [31, 36, 62], "quantiti": [37, 39, 41, 47], "quartil": 50, "question": [16, 19, 20, 29, 32, 39, 49, 50, 64, 66, 68, 70], "question_num": 11, "question_word": 20, "quick": [1, 43], "quickli": 0, "quit": 40, "quot": [22, 48, 64], "quotat": [22, 48], "rabbit": 62, "rain": 41, "rais": [67, 71], "random": 55, "rang": [5, 8, 24, 30, 33, 34, 35, 40, 51, 53, 55, 56, 57], "ranganath": [16, 31, 32, 36, 38, 43, 54, 70], "ranganath2013": 70, "ranganathetal2013_detectingflirt": 16, "rapid": [1, 4], "rare": [34, 35], "rate": [42, 51], "rather": [1, 31, 34, 35, 36, 37, 45, 46, 63], "ratio": [16, 39, 64], "raw": [0, 12, 16, 21, 31, 33, 42, 50, 64], "re": [1, 31, 36, 42, 50, 61], "read": [0, 1, 2, 16, 21, 29, 33, 61, 62, 64, 65, 66, 67], "read_csv": 1, "read_in_lexicon": 67, "readabl": [11, 33, 64, 70], "reader": 33, "readi": 1, "readili": 62, "readthedoc": [1, 24, 61], "real": [1, 55], "realit": 13, "realli": [31, 36, 50], "reason": [31, 36, 45, 46, 49], "reassur": 49, "recal": 47, "recent": 50, "recept": [18, 32, 39, 42, 50, 51, 52, 54, 56, 62, 64], "recogn": [1, 43, 47], "recognit": [0, 2, 39, 64], "recommend": [0, 42, 62], "reddit": [48, 64], "reddit_tag": 11, "redditus": 48, "reduc": 63, "reduce_chunk": 63, "redund": [42, 62], "refer": [0, 1, 2, 11, 22, 24, 28, 31, 42, 48, 52, 61, 62, 64, 70], "reflect": [37, 43], "regardless": 1, "regener": [0, 2, 51, 67], "regenerate_vector": 
[0, 1, 2, 67], "regex": [14, 16, 49], "regist": 37, "regress": 1, "regular": [5, 14, 30, 32, 42, 55, 58], "reichel": [53, 58, 60], "reidl": [4, 13], "reinvent": 62, "rel": [41, 51, 52, 55, 60, 64], "relat": [1, 61, 62, 64], "relationship": 36, "relev": [1, 29, 42, 44, 49, 51, 56, 61, 64, 65], "reli": [31, 34, 35, 36, 69], "reliabl": [33, 42], "remain": [1, 30, 71], "rememb": 1, "remov": [0, 2, 9, 13, 19, 28, 40, 43, 48, 49, 50, 71], "remove_active_us": 9, "renam": 1, "repair": [16, 39], "repeat": [60, 71], "repetit": 60, "replac": 19, "report": [1, 61], "repres": [2, 4, 6, 7, 11, 13, 23, 31, 34, 36, 42, 45, 46, 66, 67, 68, 71, 72, 73], "represent": [34, 38, 67], "reproduc": [36, 62], "republican": 1, "request": [32, 50, 51], "requir": [0, 1, 20, 21, 31, 55, 61, 62, 64, 65, 66, 67], "research": [1, 62], "reserv": 0, "resolv": 62, "resourc": [1, 39, 48, 61, 62], "respect": [1, 2, 12, 31, 36, 37, 69], "respons": [22, 48, 55, 58, 64], "restaur": [34, 56], "restor": 0, "restrict": 71, "result": [40, 55, 65, 72], "retain": [2, 16, 20, 21, 60, 71], "retriev": 50, "retunr": 3, "return": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 30, 32, 43, 49, 50, 51, 55, 62, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73], "reveal": 62, "revert": 50, "review": 62, "rewrit": 50, "rich": 62, "riedl": [13, 40, 55], "right": [31, 36, 61, 62], "roberta": [1, 39, 42, 52, 56, 61, 64, 67], "robust": 13, "rocklag": [5, 30, 64], "room": 59, "root": [13, 40], "rough": [12, 54], "roughli": 31, "round": [13, 40, 59, 71], "round_num": 1, "row": [0, 1, 2, 9, 13, 25, 37, 40, 59, 63, 68, 71, 72, 73], "rowbotham": 62, "rucker": 5, "rule": [1, 69], "run": [0, 10, 12, 16, 35, 46, 47, 48, 51, 61, 69], "runtim": [1, 35], "sagepub": [5, 64], "sai": [1, 32, 50, 59], "said": [1, 36, 62], "same": [0, 1, 2, 31, 34, 37, 45, 48, 52, 59, 60, 62, 71], "sampl": [61, 62], "sarcast": 48, "save": [0, 1, 2, 19, 64, 67], "save_featur": 2, "sbert": [1, 28, 31, 34, 35, 36, 45, 46, 64, 65, 67], "scale": [42, 51], "schema": 1, "scheme": 0, "school": [21, 62], "scienc": [29, 39, 62], "scientist": [61, 62], "score": [1, 4, 5, 11, 12, 13, 15, 21, 24, 28, 29, 30, 31, 34, 35, 36, 38, 39, 40, 45, 46, 47, 50, 51, 53, 56, 57, 61, 64, 65, 67, 73], "script": [1, 61], "sea": 1, "seamless": 62, "search": [19, 61], "second": [0, 1, 4, 34, 42, 58, 59], "second_person": 49, "secr": [18, 49, 50, 64], "section": [1, 29, 61], "see": [0, 1, 2, 11, 30, 34, 38, 41, 45, 46, 47, 55, 62, 71], "seek": [5, 62], "segment": [0, 19], "select": [2, 4, 23, 28, 36, 45, 66, 67, 68, 71, 72, 73], "self": [1, 2, 61], "semant": [31, 34, 35, 41], "semantic_group": [1, 61], "send": [1, 37, 55], "sens": [1, 5, 31, 54, 66], "sensibl": 1, "sent": [1, 37, 64], "sentenc": [0, 1, 10, 15, 19, 20, 21, 33, 34, 35, 36, 42, 45, 46, 47, 48, 54, 56, 61, 67], "sentence_pad": 19, "sentence_split": 19, "sentence_to_train": 47, "sentencis": 19, "sentiment": [0, 1, 24, 31, 39, 42, 52, 56, 61, 62, 64, 67], "separ": [1, 2, 19, 34, 51], "sepcifi": 1, "septemb": 40, "sequenc": [1, 59], "sequenti": 1, "seri": [12, 16, 23, 28, 42, 71, 73], "serv": 12, "set": [0, 1, 2, 13, 23, 34, 48, 59], "set_self_conv_data": 2, "sever": [1, 30, 41, 42, 48, 51, 56, 61], "shall": 54, "share": [31, 36, 37], "she": [30, 31, 36], "shift": 34, "shop": 62, "short": [55, 58], "shorter": [13, 40, 41, 42, 43], "should": [0, 1, 2, 4, 14, 23, 28, 29, 31, 36, 47, 48, 54, 61, 62, 66, 67, 68, 69, 71, 72, 73], "shouldn": [31, 36], "show": [1, 37, 61], "showeth": 62, "shruti": [35, 45, 46, 47, 62], "side": 
31, "signal": [45, 55], "signifi": 42, "signific": [1, 61], "silent": 37, "similar": [1, 6, 7, 13, 28, 29, 31, 34, 35, 36, 40, 45, 46, 49, 50, 62, 65], "similarli": [1, 35], "simpl": [0, 1, 16, 19, 42, 61, 62], "simpli": [1, 5, 11, 28, 56, 62], "simplifi": 1, "simplist": 41, "sinc": [1, 32, 41, 71], "singh": 62, "singl": [0, 1, 2, 11, 12, 19, 23, 31, 34, 35, 36, 37, 41, 45, 46, 59, 62, 71, 72], "singular": [12, 41, 64], "site": 16, "situat": 37, "size": [1, 13, 63, 67], "skip": 1, "slightli": [32, 62, 63], "slow": 1, "small": 40, "so": [1, 2, 10, 30, 31, 36, 37, 50, 61, 62, 66], "social": [29, 39, 61, 62], "socsci": 16, "softwar": 62, "sohi": 62, "sol3": 4, "solut": [1, 59], "solv": 62, "some": [0, 1, 11, 17, 29, 32, 34, 35, 37, 41, 61, 63], "somebodi": [31, 36], "someon": [22, 29, 31, 36, 47, 48, 61, 64], "someplac": [31, 36], "someth": 47, "sometim": 1, "somewhat": 35, "soon": 62, "sorri": [16, 32, 50], "sort": 10, "sound": [47, 51], "sourc": [4, 5, 6, 12, 13, 16, 17, 21, 34, 35, 50, 64, 68], "space": [34, 40, 71], "spaci": [1, 19, 47, 49, 50, 61], "span": 63, "spars": 32, "speak": [1, 31, 36, 37, 59, 60, 62], "speaker": [0, 1, 2, 6, 8, 9, 25, 31, 34, 35, 37, 38, 42, 45, 46, 61, 66, 71, 72], "speaker_id": [2, 61, 72], "speaker_id_col": [0, 1, 2, 6, 8, 9, 25, 26, 27, 61, 65, 66, 71, 72], "speaker_nicknam": [0, 1, 2, 6, 9, 59, 66, 71], "special": [0, 1, 2, 48, 71], "specif": [1, 2, 12, 32, 41, 48, 55, 61, 62, 69, 71], "specifi": [1, 2, 19, 47, 49, 65, 66, 67, 68, 71, 72, 73], "speciifc": 63, "spend": [51, 62], "spike": 55, "split": [19, 21, 43, 63], "spoke": 59, "spoken": [11, 37], "spread": 55, "squar": [13, 40], "ssrn": 4, "stabl": 40, "stack": 14, "stackoverflow": 68, "stage": [1, 2, 34, 71], "stamp": 55, "standard": [1, 4, 37, 40, 41, 49, 55, 58, 60, 65, 72, 73], "stanford": 70, "start": [15, 19, 20, 22, 23, 50], "statement": [1, 38, 42, 47, 48, 61, 62, 64], "statist": [1, 65, 66, 68], "statologi": 41, "stdev": [1, 2, 11, 65, 66], "stem": 42, "step": [1, 4, 28, 41, 45, 46, 51], "still": [1, 41, 45, 46], "stochast": 40, "stop": [40, 62], "stopword": [13, 19], "store": [1, 12, 16, 41, 49, 51, 61, 65, 67], "stoword": 42, "str": [2, 3, 4, 5, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 63, 64, 65, 66, 67, 68, 71, 72, 73], "str_to_vec": 67, "str_vec": 67, "straightforward": 29, "strategi": [17, 30, 32, 38, 39, 42, 49, 54, 64], "stream": 35, "strictli": 1, "string": [0, 1, 2, 4, 8, 12, 13, 14, 19, 23, 24, 50, 66, 67, 68, 71, 72, 73], "strongli": [1, 41, 61], "structur": [0, 36, 49], "student": [21, 33], "studi": [1, 34, 62], "style": [1, 31, 36, 59], "sub": [0, 1, 71], "subfold": 1, "subject": [5, 24, 28, 39, 49, 64], "subjunct": 50, "sublist": 28, "submiss": 55, "subpart": [1, 71], "subsequ": [1, 30, 51, 58], "subset": 62, "substanc": 36, "substant": 31, "substanti": 1, "substr": 30, "subtask": 1, "subtract": [41, 58], "succe": 62, "success": [0, 1, 4, 31, 36, 43, 55, 58], "suggest": [1, 13, 34, 42, 44, 50], "suit": [62, 64], "sum": [1, 28, 34, 61, 64, 65, 66, 72], "summar": [0, 1, 69], "summari": [65, 66, 72], "summariz": [0, 65], "summarize_featur": 69, "suppl": 6, "support": [1, 15, 61], "suppos": 1, "sure": 30, "swear": 49, "syntax": [1, 32, 61], "system": [2, 59, 64], "t": [0, 1, 15, 29, 31, 36, 45, 49, 54, 61, 62, 67], "tabl": [1, 62], "tag": 39, "take": [1, 4, 5, 9, 14, 25, 29, 31, 34, 37, 39, 42, 55, 61, 65, 67, 71], "taken": [59, 71], "talk": [1, 37, 47, 59, 62], "tandem": [1, 61], "target": 15, "task": [1, 2, 59, 71], "tausczik": [12, 37, 41, 52], 
"tausczikpennebaker2013": 12, "team": [0, 1, 4, 11, 12, 13, 34, 39, 40, 59, 65], "team_bursti": 4, "team_comm_tool": [1, 61], "teamcommtool": 1, "technic": [29, 39, 61, 62], "teghxgbqdhgaaaaa": 5, "tempor": [0, 2, 55, 58, 64, 71], "temporal_featur": 11, "tend": [1, 34, 60], "term": [1, 28, 59], "termin": [1, 2, 61], "terribl": 51, "test": [13, 33, 47], "text": [0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 28, 32, 33, 36, 42, 48, 55, 62, 64, 67, 71], "text_based_featur": 64, "textblob": [24, 39, 51, 52, 64], "textblob_sentiment_analysi": 11, "than": [0, 1, 2, 11, 13, 31, 34, 35, 36, 37, 40, 41, 45, 46, 54, 60, 62, 63], "thee": 62, "thei": [0, 1, 11, 28, 29, 31, 34, 36, 37, 39, 42, 47, 58, 59, 61, 62, 67], "them": [0, 1, 2, 19, 28, 29, 31, 36, 50, 51, 55, 59, 61, 62, 64, 65, 66, 67], "themselv": [31, 36, 60], "theoret": 35, "theori": [34, 50], "therefor": [0, 1, 11, 28, 37, 45, 59, 62, 69], "thi": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 20, 21, 23, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 71, 72, 73], "thing": [48, 61], "think": [1, 38, 47], "thorough": [43, 62], "those": [1, 21, 31, 36, 61], "though": [34, 42, 50], "thought": [1, 35, 45], "thread": [1, 61], "three": [0, 1, 2, 22, 34, 37, 40, 51, 61, 62, 69, 71], "threshold": [15, 47], "through": [1, 45, 46, 50, 61, 62], "throughout": [31, 35, 36, 40, 45, 46, 55, 63], "tht": 35, "thu": [1, 34, 35, 36, 37, 46, 55, 71], "time": [0, 1, 4, 23, 34, 35, 39, 42, 48, 51, 55, 59, 61, 62, 63, 64, 65, 66, 71], "time_diff": 55, "timediff": 4, "timestamp": [0, 1, 2, 8, 23, 58, 61, 62, 63, 64, 71], "timestamp_col": [0, 1, 2, 8, 61, 63, 64, 65, 71], "timestamp_end": [1, 23, 61, 64], "timestamp_start": [1, 23, 61, 64], "timestamp_unit": [0, 2, 23, 64], "to_datetim": [0, 2], "todai": [34, 35, 41, 43, 45, 46, 47], "todo": 66, "togeth": [0, 62, 66], "token": [16, 19, 39, 49, 54, 64], "token_count": [19, 49], "too": [30, 31, 36, 62], "took": [1, 59], "tool": [1, 61, 62], "toolkit": [0, 1, 11, 42, 45, 46, 55, 62, 65, 66], "top": [1, 50, 59], "topic": [1, 13, 31, 34, 40, 42, 43, 65], "tormala": 5, "total": [0, 1, 3, 12, 16, 25, 31, 34, 36, 37, 41, 44, 53, 59, 60, 61, 62, 63, 64, 66, 72], "touch": [1, 61], "toward": [31, 36, 38, 42, 45, 46], "track": [65, 66], "tradit": 49, "train": [1, 2, 15, 64], "train_spacy_n": 15, "transcript": 0, "transfom": [45, 46], "transform": [1, 31, 34, 35, 36, 51], "transform_utter": 50, "treat": [0, 1, 59, 61], "tri": [50, 64], "trivial": [3, 44, 62], "troubl": [1, 61], "true": [0, 1, 2, 37, 61, 63, 65, 66, 67, 71], "truncat": 2, "truth_intensifi": 49, "ttr": 64, "tupl": [0, 1, 2, 15, 19, 64], "turn": [0, 2, 25, 28, 31, 32, 37, 39, 61, 64, 65, 71], "turn_count": 59, "turn_df": 71, "turn_id": 71, "turn_taking_featur": 11, "twice": 63, "twitter": [1, 51, 61], "two": [0, 1, 2, 23, 31, 34, 36, 41, 45, 46, 52, 62, 63], "txt": 19, "type": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 37, 39, 52, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73], "typic": [1, 34, 40, 41, 42, 52, 60], "u": [0, 1, 2, 22, 31, 36, 48, 49, 58], "uci": 16, "uh": [31, 36], "ulrich": 55, "um": [31, 36, 60], "umbrella": [8, 29, 34], "uncertain": [5, 30], "uncertainti": 30, "under": [0, 1, 10, 11, 12, 28, 40], "underli": [1, 61], "underscor": [1, 61], "understand": [0, 33, 39, 43, 48, 58, 61, 62], "understood": 33, "uninterrupt": 59, "uniqu": [0, 1, 2, 6, 9, 13, 16, 23, 25, 41, 47, 52, 60, 
61, 63, 71], "unit": [0, 2, 23], "univers": 62, "unix": 58, "unless": [31, 36], "unpack": 62, "unpreprocess": 0, "until": [31, 36, 45, 46], "unzip": [1, 61], "up": [1, 17, 21, 28, 31, 35, 36, 37, 45, 46, 51, 59, 61], "updat": [1, 9, 40, 54, 61], "upenn": 1, "upgrad": 50, "upload": 13, "upon": 33, "upper": 42, "us": [0, 1, 2, 3, 5, 11, 12, 13, 17, 19, 24, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 56, 57, 60, 62, 64, 65, 66, 67, 71], "usag": [0, 21, 24], "use_time_if_poss": 63, "user": [0, 1, 2, 9, 15, 22, 37, 47, 48, 51, 61, 62, 63, 64, 65, 66, 69, 72], "user_aggreg": [0, 1, 2, 65, 66], "user_column": [0, 1, 2, 65, 66], "user_data": [2, 65, 66], "user_df": 9, "user_level_featur": 2, "user_list": 9, "user_method": [0, 1, 2, 65, 66], "userlevelfeaturescalcul": [2, 66, 69], "usernam": [22, 48], "utf": 1, "util": [1, 12, 21, 61, 62, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73], "utilti": 62, "utter": [0, 1, 2, 3, 4, 5, 13, 14, 15, 16, 17, 20, 21, 23, 24, 30, 31, 32, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 50, 51, 52, 54, 58, 60, 61, 67], "utteranc": 42, "v": [0, 1, 13, 42, 61], "v0": 0, "valenc": 51, "valid": [1, 23, 55], "valu": [0, 1, 2, 5, 6, 10, 12, 13, 18, 19, 23, 28, 30, 31, 34, 36, 37, 40, 41, 42, 45, 46, 47, 55, 59, 61, 64, 67, 68, 71, 72, 73], "vari": [13, 31, 34, 35], "variabl": [1, 56, 57, 64, 65, 66], "varianc": [1, 8, 34], "variance_in_dd": 11, "variat": [4, 32], "varieti": [42, 62], "variou": [19, 42, 64, 65, 66], "vast": 62, "ve": [0, 31, 36, 50, 61], "vec": 6, "vect_data": [1, 7, 8, 28, 61, 64, 65, 66], "vect_path": 67, "vector": [0, 2, 6, 7, 8, 13, 28, 34, 35, 40, 55, 61, 64, 65, 67], "vector_data": [0, 1, 2, 61], "vector_directori": [0, 1, 2, 61, 65], "vein": 45, "verb": [19, 31, 36], "verbal": 32, "veri": [5, 28, 30, 31, 34, 35, 36, 42, 49, 54], "verifi": 2, "verit": 62, "version": [1, 12, 14, 21, 28, 31, 40, 50, 51, 61], "versu": [4, 29, 47, 55, 59], "via": [3, 44], "view": 50, "visit": 41, "voila": 62, "w": 31, "wa": [0, 1, 2, 5, 12, 31, 32, 35, 36, 47, 51, 56, 59, 62, 71], "wai": [0, 1, 2, 29, 30, 31, 32, 34, 49, 50, 54, 56, 57, 61, 62, 66], "waiai": 62, "wait": [4, 55], "walk": 1, "walkthrough": [0, 61, 62], "want": [1, 28, 34, 59, 61, 62, 65, 66, 67], "warn": [1, 50], "watt": [1, 2, 62, 71], "we": [0, 1, 2, 3, 4, 5, 9, 10, 11, 12, 15, 16, 18, 23, 24, 28, 29, 30, 31, 34, 35, 36, 37, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59, 61, 62, 66, 67, 71], "web": 70, "websit": [1, 61], "week": 47, "weight": 66, "weigt": 31, "welcom": 61, "well": [11, 29, 31, 36, 55, 62], "went": 41, "were": [1, 2, 12, 31, 36, 42], "western": 1, "wh": [19, 31, 36], "wh_question": [32, 49, 54], "wharton": 62, "what": [1, 2, 12, 16, 20, 29, 31, 32, 34, 35, 36, 39, 41, 45, 46, 47, 50, 54, 62, 63], "whatev": [1, 31, 36], "wheel": 62, "when": [1, 16, 20, 31, 33, 36, 47, 54, 55, 59, 60, 61, 62, 69, 71], "whenev": 71, "where": [1, 2, 19, 20, 28, 31, 32, 36, 37, 40, 41, 42, 48, 50, 51, 54, 59, 61, 65, 68, 73], "wherea": [31, 34, 35, 36, 43], "wherev": [31, 36], "whether": [1, 2, 10, 16, 19, 32, 37, 38, 41, 43, 47, 57, 58, 62, 63, 64, 67, 71], "which": [0, 1, 2, 3, 4, 5, 7, 9, 12, 13, 15, 16, 18, 23, 25, 28, 31, 34, 35, 36, 37, 38, 40, 41, 42, 51, 53, 54, 55, 56, 57, 58, 59, 61, 62, 64, 65, 66, 68, 69, 71, 72, 73], "while": [1, 31, 32, 34, 36, 37, 44, 45, 46, 55, 62, 71], "whitespac": 43, "who": [1, 20, 31, 32, 36, 47, 51, 54, 59, 60, 62], "whole": [28, 59, 62, 71], "whom": [31, 36, 54], "whose": [31, 36, 54], "why": [20, 29, 
31, 36, 54], "wide": 31, "wien": 62, "wiki": [21, 29, 70], "wiki_link": [1, 61], "wikipedia": [21, 33, 37, 70], "williamson": 60, "wish": [1, 2, 18, 28], "within": [0, 1, 2, 8, 11, 16, 28, 30, 31, 34, 35, 36, 41, 45, 46, 52, 55, 59, 60, 62, 63, 64, 68, 71, 73], "within_group": 2, "within_person_discursive_rang": 11, "within_task": [0, 1, 2, 71], "without": [1, 19, 31, 36, 42, 47, 54, 62, 69], "won": [0, 31, 36, 45], "wonder": 56, "woolei": 4, "woollei": [13, 40, 55], "wooten": 55, "word": [0, 1, 3, 10, 11, 12, 13, 14, 16, 19, 20, 21, 22, 28, 30, 32, 33, 37, 38, 39, 40, 41, 43, 45, 46, 48, 49, 52, 53, 54, 56, 57, 62, 64, 65, 66, 69, 70], "word_mimicri": 11, "word_start": [19, 49], "wordcount": 1, "wordnet": [1, 61], "words_in_lin": 19, "work": [0, 11, 47, 50, 55, 61, 62], "world": 55, "worri": 62, "would": [1, 29, 31, 34, 35, 36, 37, 42, 50, 54, 62], "wouldn": [31, 36], "wow": 50, "wp": 13, "write": [2, 29, 60], "www": [12, 13, 18, 41, 64], "x": [0, 1, 2, 4, 46, 68], "xinlan": 62, "yashveer": 62, "ye": 19, "yeah": [31, 36], "yeoman": [18, 49, 50], "yesno_quest": [32, 49, 54], "yet": 48, "ylatau": 12, "you": [0, 1, 2, 11, 24, 29, 31, 36, 37, 43, 47, 50, 59, 61, 62, 69], "your": [0, 29, 31, 32, 36, 37, 50, 59, 61, 62], "yourself": [31, 36, 50], "yuluan": 62, "yup": [31, 36], "yuxuan": 62, "z": [12, 39, 49, 51, 64, 73], "zero": [13, 52], "zhang": 62, "zheng": 62, "zhong": 62, "zhou": 62, "zscore": 41, "zscore_chat": 41, "zscore_chats_and_convers": 69, "zscore_convers": 41, "\u00bc": 47, "\u03c4": 55}, "titles": ["The Basics (Get Started Here!)", "Worked Example", "feature_builder module", "basic_features module", "burstiness module", "certainty module", "discursive_diversity module", "fflow module", "get_all_DD_features module", "get_user_network module", "hedge module", "Features: Technical Documentation", "info_exchange_zscore module", "information_diversity module", "lexical_features_v2 module", "named_entity_recognition_features module", "other_lexical_features module", "politeness_features module", "politeness_v2 module", "politeness_v2_helper module", "question_num module", "readability module", "reddit_tags module", "temporal_features module", "textblob_sentiment_analysis module", "turn_taking_features module", "variance_in_DD module", "within_person_discursive_range module", "word_mimicry module", "FEATURE NAME", "Certainty", "Content Word Accommodation", "Conversational Repair", "Dale-Chall Score", "Discursive Diversity", "Forward Flow", "Function Word Accommodation", "Gini Coefficient", "Hedge", "Features: Conceptual Documentation", "Information Diversity", "Information Exchange", "Linguistic Inquiry and Word Count (LIWC) and Other Lexicons", "Message Length", "Message Quantity", "Mimicry (BERT)", "Moving Mimicry", "Named Entity Recognition", "Online Discussion Tags", "Politeness/Receptiveness Markers", "Politeness Strategies", "Sentiment (RoBERTa)", "Positivity Z-Score", "Proportion of First Person Pronouns", "Question (Naive)", "Team Burstiness", "Textblob Polarity", "Textblob Subjectivity", "Time Difference", "Turn Taking Index", "Word Type-Token Ratio", "The Team Communication Toolkit", "Introduction", "assign_chunk_nums module", "calculate_chat_level_features module", "calculate_conversation_level_features module", "calculate_user_level_features module", "check_embeddings module", "gini_coefficient module", "Utilities", "preload_word_lists module", "preprocess module", "summarize_features module", "zscore_chats_and_conversation module"], "titleterms": {"A": 0, "One": 0, "The": 
[0, 61, 62], "accommod": [31, 36], "addit": 1, "advanc": 1, "aggreg": [1, 11], "analyz": 1, "assign_chunk_num": 63, "assumpt": 0, "base": 11, "basic": [0, 1, 29, 30, 31, 33, 34, 35, 36, 37, 38, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 55, 56, 57, 58, 59, 60], "basic_featur": 3, "bert": 45, "bursti": [4, 55], "cach": 1, "calculate_chat_level_featur": 64, "calculate_conversation_level_featur": 65, "calculate_user_level_featur": 66, "caveat": [1, 29, 30, 31, 33, 34, 35, 36, 38, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 55, 56, 57, 58, 59], "certainti": [5, 30], "chall": 33, "chat": [11, 39], "check_embed": 67, "citat": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "class": 69, "code": [0, 1], "coeffici": 37, "column": [1, 61], "commun": 61, "conceptu": 39, "configur": 1, "consider": 1, "content": [31, 61], "convers": [1, 11, 32, 39, 62, 69], "count": [42, 59], "cumul": 1, "custom": 1, "customiz": 0, "dale": 33, "data": 1, "declar": 61, "demo": [0, 1], "detail": 1, "differ": 58, "directori": 1, "discurs": 34, "discursive_divers": 6, "discuss": 48, "divers": [34, 40], "document": [11, 39, 62], "driver": 69, "entiti": [1, 47], "environ": [1, 61], "exampl": [1, 41, 47], "exchang": 41, "featur": [1, 11, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 69], "feature_build": 2, "featurebuild": [1, 61, 62], "fflow": 7, "file": [1, 30, 34, 35, 45, 46, 47, 51], "first": [1, 53], "flow": 35, "forward": 35, "function": [0, 36], "gener": [1, 61, 62], "get": [0, 1, 61, 62], "get_all_dd_featur": 8, "get_user_network": 9, "gini": 37, "gini_coeffici": 68, "group": 1, "hedg": [10, 38], "here": 0, "high": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "implement": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "import": [1, 61], "index": 59, "indic": 61, "info_exchange_zscor": 12, "inform": [1, 40, 41, 61], "information_divers": 13, "input": [1, 34], "inquiri": 42, "inspect": [1, 61], "interpret": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "introduct": 62, "intuit": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "kei": 0, "length": 43, "level": [11, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 69], "lexical_features_v2": 14, "lexicon": 42, "light": 0, "linguist": 42, "liwc": 42, "marker": 49, "messag": [43, 44], "mimicri": [45, 46], "modul": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73], "motiv": 62, "move": 46, "naiv": 54, "name": [1, 29, 47, 61], "named_entity_recognition_featur": 15, "note": [1, 29, 30, 31, 33, 34, 35, 36, 38, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 55, 56, 57, 58, 59], "onlin": 48, "other": [42, 69], "other_lexical_featur": 16, "ouput": 34, "our": 62, "output": [1, 30, 35, 45, 46, 47, 51], "overview": 1, "packag": [0, 1, 61], "paramet": [0, 1], "percentag": 1, "person": 53, "pip": [1, 61], "polar": 56, "polit": [49, 50], "politeness_featur": 17, "politeness_v2": 18, "politeness_v2_help": 19, "posit": 52, 
"preload_word_list": 70, "preprocess": 71, "pronoun": 53, "proport": 53, "quantiti": 44, "question": 54, "question_num": 20, "ratio": 60, "readabl": 21, "recept": 49, "recognit": [1, 47], "recommend": [1, 61], "reddit_tag": 22, "regener": 1, "relat": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "repair": 32, "roberta": 51, "run": 1, "sampl": [0, 1], "score": [33, 41, 52], "sentiment": 51, "speaker": [11, 59, 62, 69], "start": [0, 1, 61, 62], "strategi": 50, "subject": 57, "summarize_featur": 72, "tabl": 61, "tag": 48, "take": 59, "team": [55, 61, 62], "technic": 11, "temporal_featur": 23, "textblob": [56, 57], "textblob_sentiment_analysi": 24, "time": 58, "token": 60, "toolkit": 61, "touch": 0, "train": 47, "troubleshoot": [1, 61], "turn": [1, 59], "turn_taking_featur": 25, "type": 60, "us": 61, "usag": 1, "user": 11, "util": 69, "utter": [11, 39, 62, 69], "variance_in_dd": 26, "vector": 1, "virtual": [1, 61], "walkthrough": 1, "within_person_discursive_rang": 27, "word": [31, 36, 42, 60], "word_mimicri": 28, "work": 1, "your": 1, "z": [41, 52], "zscore_chats_and_convers": 73}}) \ No newline at end of file diff --git a/docs/build/html/utils/calculate_conversation_level_features.html b/docs/build/html/utils/calculate_conversation_level_features.html index 6b1355b2..b377c9a3 100644 --- a/docs/build/html/utils/calculate_conversation_level_features.html +++ b/docs/build/html/utils/calculate_conversation_level_features.html @@ -96,7 +96,7 @@

                            calculate_conversation_level_features module

                            -class utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator(chat_data: DataFrame, user_data: DataFrame, conv_data: DataFrame, vect_data: DataFrame, vector_directory: str, conversation_id_col: str, speaker_id_col: str, message_col: str, timestamp_col: str, input_columns: list)
                            +class utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator(chat_data: DataFrame, user_data: DataFrame, conv_data: DataFrame, vect_data: DataFrame, vector_directory: str, conversation_id_col: str, speaker_id_col: str, message_col: str, timestamp_col: str, convo_aggregation: bool, convo_methods: list, convo_columns: list, user_aggregation: bool, user_methods: list, user_columns: list, chat_features: list)

                            Bases: object

                            Initialize variables and objects used by the ConversationLevelFeaturesCalculator class.

                            This class uses various feature modules to define conversation-level features. It reads input data and @@ -109,7 +109,13 @@

                          • conv_data (pd.DataFrame) – Pandas dataframe of conversation-level features derived from the chat-level dataframe

                          • vect_data (pd.DataFrame) – Pandas dataframe of processed vectors derived from the chat-level dataframe

                          • vector_directory (str) – Directory where vector files are stored

                          • -
                          • input_columns (list) – List of columns in the chat-level features dataframe that should not be summarized

                          • +
                          • convo_aggregation (bool) – If true, will aggregate features at the conversational level

                          • +
                          • convo_methods (list) – Specifies which functions users want to aggregate with (e.g., mean, stdev…)

                          • +
                          • convo_columns (list) – Specifies which columns (at the chat level) users want aggregated

                          • +
• user_aggregation (bool) – If true, will aggregate features at the user level

                          • +
                          • user_methods (list) – Specifies which functions users want to aggregate with (e.g., mean, stdev…) at the user level

                          • +
                          • user_columns (list) – Specifies which columns (at the chat level) users want aggregated for the user level

                          • +
                          • chat_features (list) – Tracks all the chat-level features generated by the toolkit
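To make the expanded signature concrete, here is a minimal instantiation sketch. This class is normally constructed internally by the FeatureBuilder, so treat the dataframe names (``chat_df``, ``user_df``, ``conv_df``, ``vect_df``) and ``chat_feature_list`` as placeholders rather than documented API:

.. code-block:: python

   # Hypothetical sketch only: the FeatureBuilder builds this object internally.
   from utils.calculate_conversation_level_features import ConversationLevelFeaturesCalculator

   conv_calculator = ConversationLevelFeaturesCalculator(
       chat_data=chat_df,        # placeholder: chat (utterance)-level dataframe
       user_data=user_df,        # placeholder: user-level dataframe
       conv_data=conv_df,        # placeholder: conversation-level dataframe
       vect_data=vect_df,        # placeholder: message embeddings
       vector_directory="./vector_data/",
       conversation_id_col="conversation_num",   # documented default
       speaker_id_col="speaker_nickname",        # documented default
       message_col="message",                    # placeholder column name
       timestamp_col="timestamp",                # placeholder column name
       convo_aggregation=True,
       convo_methods=["mean", "max", "min", "stdev"],  # documented defaults
       convo_columns=None,       # None aggregates all numeric chat-level columns
       user_aggregation=True,
       user_methods=["mean", "max", "min", "stdev"],
       user_columns=None,
       chat_features=chat_feature_list,  # placeholder: list of generated chat-level features
   )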

                          @@ -120,11 +126,14 @@

                          This function computes various conversation-level features by aggregating chat-level and user-level features, and appends them as new columns to the input conversation-level data.

                          -
                          Returns:
                          -

                          The conversation-level dataset with new columns for each conversation-level feature

                          +
                          Parameters:
                          +

feature_methods (list) – The list of methods to use to generate features

                          -
                          Return type:
                          -

                          pd.DataFrame

                          +
                          Returns:
                          +

                          The conversation-level dataset with new columns for each conversation-level feature

                          +
                          +
                          Return type:
                          +

                          pd.DataFrame

                          @@ -206,8 +215,7 @@

                          This function computes the Gini index for features involving counts, such as: - Word count - Character count -- Message count -- Function word accommodation

                          +- Message count

                          The Gini index is then merged into the conversation-level data.

                          Returns:
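For intuition, a Gini index near 0 means the counts (for example, words per speaker) are evenly distributed, while values near 1 mean one speaker dominates. The following is an illustrative stand-alone computation using the standard mean-absolute-difference formula; it is not the toolkit's internal code:

.. code-block:: python

   # Illustrative Gini coefficient over per-speaker counts (not the toolkit's code).
   import numpy as np

   def gini(counts):
       """Mean absolute pairwise difference, normalized by twice the mean."""
       x = np.asarray(counts, dtype=float)
       if x.mean() == 0:
           return 0.0
       mean_abs_diff = np.abs(x[:, None] - x[None, :]).mean()
       return mean_abs_diff / (2 * x.mean())

   print(gini([10, 10, 10]))  # 0.0   -> perfectly equal participation
   print(gini([100, 5, 5]))   # ~0.58 -> one speaker dominates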
                          diff --git a/docs/build/html/utils/calculate_user_level_features.html b/docs/build/html/utils/calculate_user_level_features.html index 1577598c..a8a17750 100644 --- a/docs/build/html/utils/calculate_user_level_features.html +++ b/docs/build/html/utils/calculate_user_level_features.html @@ -96,7 +96,7 @@

                          calculate_user_level_features module

                          -class utils.calculate_user_level_features.UserLevelFeaturesCalculator(chat_data: DataFrame, user_data: DataFrame, vect_data: DataFrame, conversation_id_col: str, speaker_id_col: str, input_columns: list)
                          +class utils.calculate_user_level_features.UserLevelFeaturesCalculator(chat_data: DataFrame, user_data: DataFrame, vect_data: DataFrame, conversation_id_col: str, speaker_id_col: str, user_aggregation: bool, user_methods: list, user_columns: list, chat_features: list)

                          Bases: object

                          Initialize variables and objects used by the UserLevelFeaturesCalculator class.

                          This class uses various feature modules to define user- (speaker) level features. It reads input data and @@ -109,7 +109,10 @@

                        • vect_data (pd.DataFrame) – Pandas dataframe of message embeddings corresponding to each instance of the chat data

                        • conversation_id_col (str) – A string representing the column name that should be selected as the conversation ID. Defaults to “conversation_num”.

                        • speaker_id_col (str) – A string representing the column name that should be selected as the speaker ID. Defaults to “speaker_nickname”.

                        • -
                        • input_columns (list) – List of columns in the chat-level features dataframe that should not be summarized

                        • +
                        • user_aggregation (bool) – If true, will aggregate features at the user level

                        • +
                        • user_methods (list) – Specifies which functions users want to aggregate with (e.g., mean, stdev…) at the user level

                        • +
                        • user_columns (list) – Specifies which columns (at the chat level) users want aggregated for the user level

                        • +
                        • chat_features (list) – Tracks all the chat-level features generated by the toolkit
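As with the conversation-level calculator above, a hedged instantiation sketch of the updated signature; the dataframes are placeholders, and the object is normally constructed internally by the FeatureBuilder:

.. code-block:: python

   # Hypothetical sketch only: the FeatureBuilder builds this object internally.
   from utils.calculate_user_level_features import UserLevelFeaturesCalculator

   user_calculator = UserLevelFeaturesCalculator(
       chat_data=chat_df,     # placeholder: chat (utterance)-level dataframe
       user_data=user_df,     # placeholder: user-level dataframe
       vect_data=vect_df,     # placeholder: message embeddings
       conversation_id_col="conversation_num",   # documented default
       speaker_id_col="speaker_nickname",        # documented default
       user_aggregation=True,
       user_methods=["mean", "max", "min", "stdev"],  # documented defaults
       user_columns=None,     # assumption: None aggregates all numeric columns
       chat_features=chat_feature_list,  # placeholder
   )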

                        @@ -144,21 +147,6 @@ -
                        -
                        -get_user_level_averaged_features() None
                        -

                        Aggregate summary statistics by calculating average user-level features from chat-level features.

                        -

                        This function calculates and merges the average features into the user-level data.

                        -
                        -
                        Returns:
                        -

                        None

                        -
                        -
                        Return type:
                        -

                        None

                        -
                        -
                        -
                        -
                        get_user_level_summary_statistics_features() None
                        @@ -179,8 +167,7 @@

                        Features for which summing makes sense include: - Word count (total number of words) - Character count -- Message count -- Function word accommodation

                        +- Message count

                        This function calculates and merges the summed features into the user-level data.

                        Returns:
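For the count-based features above (word, character, and message counts), summing per speaker reduces to a single pandas group-by; a minimal sketch, in which all column names are placeholders rather than the toolkit's actual output schema:

.. code-block:: python

   # Sketch: per-speaker sums of countable chat-level features.
   # "conversation_num", "speaker_nickname", "num_words", "num_chars" are placeholders.
   import pandas as pd

   def sum_counts_per_user(chat_df: pd.DataFrame) -> pd.DataFrame:
       return (chat_df
               .groupby(["conversation_num", "speaker_nickname"], as_index=False)
               [["num_words", "num_chars"]]
               .sum())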
                        diff --git a/docs/build/html/utils/summarize_features.html b/docs/build/html/utils/summarize_features.html index e4d8f4dd..11d56eec 100644 --- a/docs/build/html/utils/summarize_features.html +++ b/docs/build/html/utils/summarize_features.html @@ -58,12 +58,17 @@
                      • preload_word_lists module
                      • preprocess module
                      • summarize_features module
                      • @@ -105,11 +110,35 @@

                        summarize_features module

                        -
                        -utils.summarize_features.get_average(input_data, column_to_summarize, new_column_name, conversation_id_col)
                        -

                        Generate a summary DataFrame with the average of a specified column per conversation.

                        -

                        This function calculates the average of a specified column for each conversation in the input data, -and returns a DataFrame containing the conversation number and the calculated average.

                        +
                        +utils.summarize_features.get_max(input_data, column_to_summarize, new_column_name, conversation_id_col)
                        +

                        Generate a summary DataFrame with the maximum value of a specified column per conversation.

                        +

                        This function calculates the maximum value of a specified column for each conversation in the input data, +and returns a DataFrame containing the conversation number and the calculated maximum value.

                        +
                        +
                        Parameters:
                        +
                          +
                        • input_data (pandas.DataFrame) – The DataFrame containing data at the chat or user level.

                        • +
                        • column_to_summarize (str) – The name of the column to be aggregated for maximum value.

                        • +
                        • new_column_name (str) – The desired name for the new summary column.

                        • +
                        • conversation_id_col (str) – A string representing the column name that should be selected as the conversation ID.

                        • +
                        +
                        +
                        Returns:
                        +

                        A DataFrame with the conversation number and the maximum value of the specified column.

                        +
                        +
                        Return type:
                        +

                        pandas.DataFrame

                        +
                        +
                        +
                        + +
                        +
                        +utils.summarize_features.get_mean(input_data, column_to_summarize, new_column_name, conversation_id_col)
                        +

                        Generate a summary DataFrame with the mean of a specified column per conversation.

                        +

                        This function calculates the mean of a specified column for each conversation in the input data, +and returns a DataFrame containing the conversation number and the calculated mean.

                        Parameters:
                          @@ -120,7 +149,7 @@
                        Returns:
                        -

                        A DataFrame with the conversation number and the average of the specified column.

                        +

                        A DataFrame with the conversation number and the mean of the specified column.

                        Return type:

                        pandas.DataFrame

                        @@ -129,22 +158,22 @@
                        -
                        -utils.summarize_features.get_max(input_data, column_to_summarize, new_column_name, conversation_id_col)
                        -

                        Generate a summary DataFrame with the maximum value of a specified column per conversation.

                        -

                        This function calculates the maximum value of a specified column for each conversation in the input data, -and returns a DataFrame containing the conversation number and the calculated maximum value.

                        +
                        +utils.summarize_features.get_median(input_data, column_to_summarize, new_column_name, conversation_id_col)
                        +

                        Generate a summary DataFrame with the median of a specified column per conversation.

                        +

                        This function calculates the median of a specified column for each conversation in the input data, +and returns a DataFrame containing the conversation number and the calculated median.

                        Parameters:
                        • input_data (pandas.DataFrame) – The DataFrame containing data at the chat or user level.

                        • -
                        • column_to_summarize (str) – The name of the column to be aggregated for maximum value.

                        • +
                        • column_to_summarize (str) – The name of the column to be aggregated for median.

                        • new_column_name (str) – The desired name for the new summary column.

                        • conversation_id_col (str) – A string representing the column name that should be selected as the conversation ID.

                        Returns:
                        -

                        A DataFrame with the conversation number and the maximum value of the specified column.

                        +

                        A DataFrame with the conversation number and the median of the specified column.

                        Return type:

                        pandas.DataFrame

                        @@ -225,22 +254,118 @@
                        -
                        -utils.summarize_features.get_user_average_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col)
                        +
                        +utils.summarize_features.get_user_max_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col)
                        +

                        Generate a user-level summary DataFrame by maximizing a specified column per individual.

                        +

                        This function groups chat-level data by user and conversation, calculates the max values +of a specified numeric column for each user, and returns the resulting DataFrame.

                        +
                        +
                        Parameters:
                        +
                          +
                        • chat_level_data (pandas.DataFrame) – The DataFrame in which each row represents a single chat.

                        • +
                        • on_column (str) – The name of the numeric column to max for each user.

                        • +
                        • conversation_id_col (str) – A string representing the column name that should be selected as the conversation ID.

                        • +
• speaker_id_col (str) – The column name representing the user identifier.

                        • +
                        +
                        +
                        Returns:
                        +

                        A grouped DataFrame with the max of the specified column per individual.

                        +
                        +
                        Return type:
                        +

                        pandas.DataFrame

                        +
                        +
                        +
                        + +
                        +
                        +utils.summarize_features.get_user_mean_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col)

                        Generate a user-level summary DataFrame by averaging a specified column per individual.

                        -

                        This function groups chat-level data by user and conversation, calculates the average values +

                        This function groups chat-level data by user and conversation, calculates the mean values +of a specified numeric column for each user, and returns the resulting DataFrame.

                        +
                        +
                        Parameters:
                        +
                          +
                        • chat_level_data (pandas.DataFrame) – The DataFrame in which each row represents a single chat.

                        • +
                        • on_column (str) – The name of the numeric column to mean for each user.

                        • +
                        • conversation_id_col (str) – A string representing the column name that should be selected as the conversation ID.

                        • +
• speaker_id_col (str) – The column name representing the user identifier.

                        • +
                        +
                        +
                        Returns:
                        +

                        A grouped DataFrame with the mean of the specified column per individual.

                        +
                        +
                        Return type:
                        +

                        pandas.DataFrame

                        +
                        +
                        +
                        + +
                        +
                        +utils.summarize_features.get_user_median_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col)
                        +

                        Generate a user-level summary DataFrame with the median of a specified column per individual.

                        +

                        This function groups chat-level data by user and conversation, calculates the median values +of a specified numeric column for each user, and returns the resulting DataFrame.

                        +
                        +
                        Parameters:
                        +
                          +
                        • chat_level_data (pandas.DataFrame) – The DataFrame in which each row represents a single chat.

                        • +
                        • on_column (str) – The name of the numeric column to median for each user.

                        • +
                        • conversation_id_col (str) – A string representing the column name that should be selected as the conversation ID.

                        • +
• speaker_id_col (str) – The column name representing the user identifier.

                        • +
                        +
                        +
                        Returns:
                        +

                        A grouped DataFrame with the median of the specified column per individual.

                        +
                        +
                        Return type:
                        +

                        pandas.DataFrame

                        +
                        +
                        +
                        + +
                        +
                        +utils.summarize_features.get_user_min_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col)
                        +

Generate a user-level summary DataFrame by minimizing a specified column per individual.

                        +

                        This function groups chat-level data by user and conversation, calculates the min values +of a specified numeric column for each user, and returns the resulting DataFrame.

                        +
                        +
                        Parameters:
                        +
                          +
                        • chat_level_data (pandas.DataFrame) – The DataFrame in which each row represents a single chat.

                        • +
• on_column (str) – The name of the numeric column to min for each user.

                        • +
                        • conversation_id_col (str) – A string representing the column name that should be selected as the conversation ID.

                        • +
• speaker_id_col (str) – The column name representing the user identifier.

                        • +
                        +
                        +
                        Returns:
                        +

                        A grouped DataFrame with the min of the specified column per individual.

                        +
                        +
                        Return type:
                        +

                        pandas.DataFrame

                        +
                        +
                        +
                        + +
                        +
                        +utils.summarize_features.get_user_stdev_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col)
                        +

Generate a user-level summary DataFrame with the standard deviation of a specified column per individual.

                        +

                        This function groups chat-level data by user and conversation, calculates the standard deviation values of a specified numeric column for each user, and returns the resulting DataFrame.

                        Parameters:
                        • chat_level_data (pandas.DataFrame) – The DataFrame in which each row represents a single chat.

                        • -
                        • on_column (str) – The name of the numeric column to average for each user.

                        • +
• on_column (str) – The name of the numeric column for which to compute the standard deviation for each user.

                        • conversation_id_col (str) – A string representing the column name that should be selected as the conversation ID.

• speaker_id_col (str) – The column name representing the user identifier.

                        Returns:
                        -

                        A grouped DataFrame with the average of the specified column per individual.

                        +

                        A grouped DataFrame with the standard deviation of the specified column per individual.

                        Return type:

                        pandas.DataFrame

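Taken together, the helpers above share a single calling convention, and each is conceptually a thin wrapper around a pandas group-by. A hedged sketch follows: column names such as ``num_words`` and ``conversation_num`` are placeholders, and the re-implementation is illustrative rather than the toolkit's actual code:

.. code-block:: python

   import pandas as pd
   from utils import summarize_features  # assumes the toolkit's utils package is importable

   # Usage sketch: per-conversation mean of one chat-level column,
   # following the documented signature of get_mean.
   mean_words = summarize_features.get_mean(
       input_data=chat_df,                      # placeholder chat-level DataFrame
       column_to_summarize="num_words",         # placeholder column name
       new_column_name="mean_num_words",
       conversation_id_col="conversation_num",
   )

   # Illustrative re-implementation of the user-level mean helper:
   # group by (conversation, speaker) and average one numeric column.
   def user_mean_sketch(chat_level_data, on_column, conversation_id_col, speaker_id_col):
       return (chat_level_data
               .groupby([conversation_id_col, speaker_id_col], as_index=False)[on_column]
               .mean()
               .rename(columns={on_column: "mean_" + on_column}))  # naming is an assumption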
diff --git a/docs/source/basics.rst b/docs/source/basics.rst index 1a071480..17531718 100644 --- a/docs/source/basics.rst +++ b/docs/source/basics.rst @@ -92,4 +92,22 @@ Here are some parameters that can be customized. For more details, refer to the 5. ``regenerate_vectors``: Force-regenerate vector data even if it already exists. -6. ``compute_vectors_from_preprocessed``: Computes vectors using preprocessed text (that is, with capitalization and punctuation removed). This was the default behavior for v.0.1.3 and earlier, but we now default to computing metrics on the unpreprocessed text (which INCLUDES capitalization and punctuation), and this parameter now defaults to False. \ No newline at end of file +6. ``compute_vectors_from_preprocessed``: Computes vectors using preprocessed text (that is, with capitalization and punctuation removed). This was the default behavior for v.0.1.3 and earlier, but we now default to computing metrics on the unpreprocessed text (which INCLUDES capitalization and punctuation), and this parameter now defaults to False. + +7. **Custom Aggregation of Utterance (Chat)-Level Attributes** (``convo_aggregation``, ``convo_methods``, ``convo_columns``, ``user_aggregation``, ``user_methods``, and ``user_columns``): Customize the ways in which attributes at a lower level of analysis (for example, the number of words in a given message) get aggregated to a higher level of analysis (for example, the total number of words in an entire conversation). See the Worked Example (:ref:`custom_aggregation`) for details. + +Example Usage: + +.. code-block:: python + + convo_methods = ['max', 'median'] # This aggregates ONLY "positive_bert" at the conversation level using max and median. + convo_columns = ['positive_bert'] + user_methods = ['mean'] # This aggregates ONLY "negative_bert" at the speaker/user level using mean. + user_columns = ['negative_bert'] + +To turn off aggregation, set the following parameters to ``False``. By default, both are ``True`` as aggregation is performed automatically: + +.. code-block:: python + + convo_aggregation = False + user_aggregation = False \ No newline at end of file diff --git a/docs/source/examples.rst b/docs/source/examples.rst index 8d39f537..d3c0c97f 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -1,10 +1,12 @@ .. _examples: +================ Worked Example ================ +------------------- Demo / Sample Code -******************* +------------------- After following the "Getting Started" steps below, the Team Communication Toolkit can be imported at the top of any Python script. We have provided a simple example file, "featurize.py", and a demo notebook, "demo.ipynb," under our `examples folder `_ on GitHub. @@ -17,7 +19,7 @@ We also have demos available on Google Colab that you can copy and run on your o Finally, this page will walk you through a case study, highlighting top use cases and considerations when using the toolkit. Getting Started -**************** +================= To use our tool, please ensure that you have Python >= 3.10 installed and a working version of `pip `_, which is Python's package installer. Then, in your local environment, run the following: @@ -28,7 +30,7 @@ To use our tool, please ensure that you have Python >= 3.10 installed and a work This command will automatically install our package and all required dependencies. 
Troubleshooting -++++++++++++++++ +----------------- In the event that some dependency installations fail (for example, you may get an error that ``en_core_web_sm`` from Spacy is not found, or that there is a missing NLTK resource), please run this simple one-line command in your terminal, which will force the installation of Spacy and NLTK dependencies: @@ -41,14 +43,14 @@ If you encounter a further issue in which the 'wordnet' package from NLTK is not You can also find a full list of our requirements `here `_. Import Recommendations: Virtual Environment and Pip -+++++++++++++++++++++++++++++++++++++++++++++++++++++ +----------------------------------------------------- **We strongly recommend using a virtual environment in Python to run the package.** We have several specific dependency requirements. One important one is that we are currently only compatible with numpy < 2.0.0 because `numpy 2.0.0 and above `_ made significant changes that are not compatible with other dependencies of our package. As those dependencies are updated, we will support later versions of numpy. **We also strongly recommend that your version of pip is up-to-date (>=24.0).** There have been reports in which users have had trouble downloading dependencies (specifically, the Spacy package) with older versions of pip. If you get an error with downloading ``en_core_web_sm``, we recommend updating pip. Importing the Package -++++++++++++++++++++++ +----------------------- After you import the package and install dependencies, you can then use our tool in your Python script as follows: @@ -61,12 +63,12 @@ Now you have access to the :ref:`feature_builder`. This is the main class that y *Note*: PyPI treats hyphens and underscores equally, so "pip install team_comm_tools" and "pip install team-comm-tools" are equivalent. However, Python does NOT treat them equally, and **you should use underscores when you import the package, like this: from team_comm_tools import FeatureBuilder**. Walkthrough: Running the FeatureBuilder on Your Data -***************************************************** +======================================================= Next, we'll go through the details of running the FeatureBuilder on your data, discussing each of the specific options / parameters at your disposal. Configuring the FeatureBuilder -++++++++++++++++++++++++++++++++ +-------------------------------- The FeatureBuilder accepts any Pandas DataFrame as the input, so you can read in data in whatever format you like. For the purposes of this walkthrough, we'll be using some jury deliberation data from `Hu et al. (2021) `_. @@ -95,10 +97,10 @@ Now we are ready to call the FeatureBuilder on our data. All we need to do is de jury_feature_builder.featurize() Basic Input Columns -^^^^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~~~~ Conversation Parameters -""""""""""""""""""""""""" +************************** * The **input_df** parameter is where you pass in your dataframe. In this case, we want to run the FeatureBuilder on the juries data that we read in! @@ -133,7 +135,7 @@ Conversation Parameters conversation_id_col = "batch_num" Vector Directory -"""""""""""""""""" +******************* * The **vector_directory** is the name of a directory in which we will store some pre-processed information. Some features require running inference from HuggingFace's `RoBERTa-based sentiment model `_, and others require generating `SBERT vectors `_. 
These processes take time, and we cache the outputs so that subsequent runs of the FeatureBuilder on the same dataset will not take as much time. Therefore, we require you to pass in a location where you'd like us to save these outputs. @@ -148,7 +150,7 @@ Vector Directory .. _output_file_details: Output File Naming Details -"""""""""""""""""""""""""""" +***************************** * There are three output files for each run of the FeatureBuilder, which mirror the three levels of analysis: utterance-, speaker-, and conversation-level. (Please see the section on `Generating Features: Utterance-, Speaker-, and Conversation-Level `_ for more details.) These are generated using the **output_file_base** parameter. @@ -188,7 +190,7 @@ Output File Naming Details Turns -"""""" +****** * The **turns** parameter controls whether we want to treat successive messages from the same person as a single turn. For example, in a text conversation, sometimes individuals will send many messages in rapid succession, as follows: @@ -204,13 +206,20 @@ Turns Advanced Configuration Columns -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + More advanced users of the FeatureBuilder should consider the following optional parameters, depending on their needs. +Regenerating Vector Cache +*************************** + * The **regenerate_vectors** parameter controls whether you'd like the FeatureBuilder to re-generate the content in the **vector_directory**, even if we have already cached the output of a previous run. It is useful if the underlying data has changed, but you want to give the output file the same name as a previous run of the FeatureBuilder. * By default, **we assume that, if your output file is named the same, the underlying vectors are the same**. If this isn't true, you should set **regenerate_vectors = True** in order to clear out the cache and re-generate the RoBERTa and SBERT outputs. +Custom Features +***************** + * The **custom_features** parameter allows you to specify features that do not exist within our default set. **We default to NOT generating four features that depend on SBERT vectors, as the process for generating the vectors tends to be slow.** However, these features can provide interesting insights into the extent to which individuals in a conversation speak "similarly" or not, based on a vector similarity metric. To access these features, simply use the **custom_features** parameter: .. code-block:: python @@ -224,6 +233,9 @@ More advanced users of the FeatureBuilder should consider the following optional * You can choose to add any of these features depending on your preference. +Analyzing First Percentage (%) +******************************** + * The **analyze_first_pct** parameter allows you to "cut off" and separately analyze the first X% of a conversation, in case you wish to separately study different sections of a conversation as it progresses. For example, you may be interested in knowing how the attributes of the first 50% of a conversation differ from the attributes of the entire conversation. Then you can specify the following: .. code-block:: python @@ -234,8 +246,105 @@ More advanced users of the FeatureBuilder should consider the following optional * By default, we will simply analyze 100% of each conversation. +Named Entity Recognition +************************** + * The parameters **ner_training_df** and **ner_cutoff** are required if you would like the FeatureBuilder to identify named entities in your conversations. 
For example, the sentence, "John, did you talk to Michael this morning?" has two named entities: "John" and "Michael." The FeatureBuilder includes a tool that automatically detects these named entities, but it requires the user (you!) to specify some training data with examples of the types of named entities you'd like to recognize. This is because proper nouns can take many forms, from standard Western-style names (e.g., "John") to pseudonymous online nicknames (like "littleHorse"). More information about these parameters can be found in :ref:`named_entity_recognition`. +.. _custom_aggregation: + +Custom Aggregation +******************** + +Imagine that you, as a researcher, are interested in high-level characteristics of the entire conversation (for example, how much is said), but you only have measures at the (lower) level of each individual utterance (for example, the number of words in each message). How would you "aggregate" information from the lower level to the higher level? + +A simple solution is to sum up the number of words per utterance, and group by the conversation identifier. Then, you would have the total number of words for the entire conversation. You can imagine doing similar aggregations for other types of statistics --- for example, the average number of words, the variance in the number of words, and so on. + +The FeatureBuilder includes built-in functionality to perform aggregations across different levels of analysis. By default, all numeric attributes generated at the utterance (chat) level are aggregated using the functions ``mean``, ``max``, ``min``, and ``stdev``. + +We perform three types of aggregations. Consider, for example, a conversation with messages containing 5, 10, and 15 words. Then we would have the following: + +- **Conversation-Level Aggregates** transform statistics at the level of an utterance (chat) to the level of a conversation. An example is the mean number of words per utterance (10) and the maximum number of words in any utterance (15). +- **Speaker (User)-Level Aggregates** transform statistics at the level of an utterance (chat) to the level of a given speaker (user; participant) in a conversation. An example is the mean number of words per message by a particular speaker. +- **Conversation-Level Aggregates of Speaker-Level Information**: transform information about the speakers (users; participants) to the level of a conversation. An example is the average number of words for the most talkative speaker. + +Given that there are multiple default aggregation functions and numerous utterance-level attributes, an (overwhelmingly) large number of aggregation statistics can be produced. As of **v.0.1.5**, aggregation behavior can be customized using the following parameters: + +- ``convo_aggregation``: A boolean that defaults to ``True``; when turned to ``False``, aggregation at the conversation level is disabled **[NOTE 1]**. +- ``convo_methods``: A list specifying which aggregation methods to use at the conversation level. Options include ``mean``, ``max``, ``min``, ``stdev``, ``median``, and ``sum`` **[NOTE 2]; [NOTE 3]**. We default to using ``mean``, ``max``, ``min``, and ``stdev``. +- ``convo_columns``: A list specifying which utterance-level attributes to aggregate to the conversation level. These should be valid columns in the utterance (chat)-level data. This defaults to ``None``, which is configured to aggregate all available numeric outputs. 
+ +Equivalent parameters for the speaker (user) level are: + +- ``user_aggregation``: A boolean that defaults to ``True``; when turned to ``False``, aggregation at the speaker (user) level is disabled **[NOTE 1]**. +- ``user_methods``: A list specifying which aggregation methods to use at the speaker/user level (with the same options as the conversation level). +- ``user_columns``: A list specifying which utterance-level attributes to aggregate at the speaker/user level. + +The table below summarizes the different types of aggregation, and the ways in which they can be customized: + +.. list-table:: Aggregation Overview + :header-rows: 1 + :widths: 20 15 20 20 10 15 25 + + * - Aggregation Type + - Default Methods + - Methods Available + - Customization Parameters + - Output DataFrame + - Example Aggregation + - Interpretation + * - Utterance (Chat) -> Conversation + - ``mean``, ``max``, ``min``, ``stdev`` + - ``mean``, ``max``, ``min``, ``stdev``, ``median``, ``sum`` + - ``convo_aggregation``, ``convo_methods``, ``convo_columns`` + - Conversation + - ``mean_num_words`` + - Average number of words per utterance in the conversation + * - Utterance (Chat) -> Speaker/User + - ``mean``, ``max``, ``min``, ``stdev`` + - ``mean``, ``max``, ``min``, ``stdev``, ``median``, ``sum`` + - ``user_aggregation``, ``user_methods``, ``user_columns`` + - Speaker/User + - ``mean_num_words`` + - Average number of words per utterance for a given individual + * - Speaker (User) -> Conversation + - ``mean``, ``max``, ``min``, ``stdev`` + - ``mean``, ``max``, ``min``, ``stdev``, ``median``, ``sum`` + - ``convo_aggregation``, ``convo_methods``, ``convo_columns`` + - Conversation + - ``max_user_mean_num_words`` + - Average number of words per utterance for the person who talked the most + + +Example Usage of Custom Aggregation Parameters ++++++++++++++++++++++++++++++++++++++++++++++++ + +To customize aggregation behavior, simply add the following when constructing your FeatureBuilder (a full constructor sketch follows this section): + +.. code-block:: python + + convo_methods = ['max', 'median'] # This aggregates ONLY "positive_bert" at the conversation level using max and median. + convo_columns = ['positive_bert'] + user_methods = ['mean'] # This aggregates ONLY "negative_bert" at the speaker/user level using mean. + user_columns = ['negative_bert'] + +To turn off aggregation, set the following parameters to ``False``. By default, both are ``True`` as aggregation is performed automatically: + +.. code-block:: python + + convo_aggregation = False + user_aggregation = False + +Important Notes and Caveats +++++++++++++++++++++++++++++ + +- **[NOTE 1]** Even when aggregation is disabled, totals of words, messages, and characters are still summarized, as these are required for calculating the Gini Coefficient features. +- **[NOTE 2]** Be careful when choosing the "sum" aggregation method, as it is not always an appropriate aggregation function. While it is a sensible choice for utterance-level attributes that are *countable* (for example, the total number of words, or other lexical wordcounts), it is a less sensible choice for others (for example, it does not make sense to sum sentiment scores for each utterance in a conversation). Consequently, using the "sum" feature will come with an associated warning. +- **[NOTE 3]** In addition to aggregating from the utterance (chat) level to the conversation level, we also aggregate from the speaker (user) level to the conversation level, using the same methods specified in ``convo_methods`` to do so. 
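To see these parameters in context, here is a sketch of a complete FeatureBuilder call. The dataframe and column names follow the jury walkthrough above, other constructor arguments are omitted, and the whole snippet should be read as illustrative rather than canonical usage:

.. code-block:: python

   from team_comm_tools import FeatureBuilder

   # Sketch: assumes the jury deliberation dataframe from the walkthrough above.
   jury_feature_builder = FeatureBuilder(
       input_df=juries_df,
       vector_directory="./vector_data/",
       conversation_id_col="batch_num",
       speaker_id_col="speaker_nickname",
       convo_methods=['max', 'median'],   # aggregate ONLY "positive_bert" at the conversation level
       convo_columns=['positive_bert'],
       user_methods=['mean'],             # aggregate ONLY "negative_bert" at the speaker (user) level
       user_columns=['negative_bert']
   )
   jury_feature_builder.featurize()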
+ +Cumulative Grouping +********************* + * The parameters **cumulative_grouping** and **within_task** address a special case of having multiple conversational identifiers; **they assume that the same team has multiple sequential conversations, and that, in each conversation, they perform one or more separate activities**. This was originally created as a companion to a multi-stage Empirica game (see: ``_). For example, imagine that a team must complete 3 different tasks, each with 3 different subparts. Then we can model this event in terms of 1 team (High level), 3 tasks (Mid level), and 3 subparts per task (Low level). * In such an activity, we assume that there are three levels of identifiers: High, Mid, and Low. @@ -297,7 +406,7 @@ More advanced users of the FeatureBuilder should consider the following optional * Finally, it is important to remember that, since cumulative groupings mean that we progressively consider more and more of the same conversation, **your conversation dataframe will substantially increase in size**, and this may affect the runtime of your FeatureBuilder. Additional FeatureBuilder Considerations -++++++++++++++++++++++++++++++++++++++++ +------------------------------------------ Here are some additional design details of the FeatureBuilder that you may wish to keep in mind: @@ -308,10 +417,11 @@ Here are some additional design details of the FeatureBuilder that you may wish * **When summarizing features from the utterance level to the conversation and speaker level, we only consider numeric features.** This is perhaps a simplifying assumption more than anything else; although we do extract non-numeric information (for example, a Dale-Chall label of whether an utterance is "Easy" to read or not; a list of named entities identified), we cannot summarize these efficiently, so they are not considered. Inspecting Generated Features -++++++++++++++++++++++++++++++ +-------------------------------- Feature Information -^^^^^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~~~~ + Every FeatureBuilder object has an underlying property called the **feature_dict**, which lists information and references about the features included in the toolkit. Assuming that **jury_feature_builder** is the name of your FeatureBuilder, you can access the feature dictionary as follows: .. code-block:: python @@ -350,7 +460,7 @@ Here is some example output (for the RoBERTa sentiment feature): 'bert_sentiment_data': True} Feature Column Names -^^^^^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~~~~~~ Once you call **.featurize()**, you can also obtain a convenient list of the feature columns generated by the toolkit: diff --git a/docs/source/features/index.rst b/docs/source/features/index.rst index 1f67bc29..be1e826c 100644 --- a/docs/source/features/index.rst +++ b/docs/source/features/index.rst @@ -32,7 +32,11 @@ Utterance-Level features are calculated *first* in the Toolkit, as many conversa Conversation-Level Features **************************** -Once utterance-level features are computed, we compute conversation-level features; some of these features represent an aggregation of utterance-level information (for example, the "average level of positivity" in a conversation is simply the mean positivity score for each utterance). Other conversation-level features are constructs that are defined only at the conversation-level, such as the level of "burstiness" in a team's communication patterns. 
+ +Base Conversation-Level Features ++++++++++++++++++++++++++++++++++++ + +The following features are constructs that are defined only at the conversation-level, such as the level of "burstiness" in a team's communication patterns. We call these the "base" conversation-level features, and they can be accessed using a property of the ``FeatureBuilder`` object: ``FeatureBuilder.conv_features_base``. .. toctree:: :maxdepth: 1 @@ -46,12 +50,17 @@ Once utterance-level features are computed, we compute conversation-level featur within_person_discursive_range turn_taking_features +Conversation-Level Aggregates ++++++++++++++++++++++++++++++++++++ +Once utterance-level features are computed, we compute conversation-level features; some of these features represent an aggregation of utterance-level information (for example, the "average level of positivity" in a conversation is simply the mean positivity score for each utterance). + +By default, all numeric attributes generated at the utterance (chat) level are aggregated using the functions ``mean``, ``max``, ``min``, and ``stdev``. However, this behavior can be customized, with details in the Worked Example (see :ref:`custom_aggregation`). + Speaker- (User) Level Features ********************************* User-level features generally represent an aggregation of features at the utterance level (for example, the average number of words spoken *by a particular user*). There is therefore limited speaker-level feature documentation, other than a function used to compute the "network" of other speakers that an individual interacts with in a conversation. -You may reference the :ref:`Speaker (User)-Level Features Page ` for more information. - +You may reference the :ref:`Speaker (User)-Level Features Page ` for more information, as well as the details in the Worked Example (see :ref:`custom_aggregation`). .. toctree:: :maxdepth: 1 diff --git a/docs/source/features_conceptual/politeness_receptiveness_markers.rst b/docs/source/features_conceptual/politeness_receptiveness_markers.rst index 314478de..5d2fd30c 100644 --- a/docs/source/features_conceptual/politeness_receptiveness_markers.rst +++ b/docs/source/features_conceptual/politeness_receptiveness_markers.rst @@ -41,46 +41,46 @@ Interpreting the Feature The SECR module contains the following 39 features. 
-Impersonal_Pronoun -First_Person_Single -Hedges -Negation -Subjectivity -Negative_Emotion -Reasoning -Agreement -Second_Person -Adverb_Limiter -Disagreement -Acknowledgement -First_Person_Plural -For_Me -WH_Questions -YesNo_Questions -Bare_Command -Truth_Intensifier -Apology -Ask_Agency -By_The_Way -Can_You -Conjunction_Start -Could_You -Filler_Pause -For_You -Formal_Title -Give_Agency -Affirmation -Gratitude -Hello -Informal_Title -Let_Me_Know -Swearing -Reassurance -Please -Positive_Emotion -Goodbye -Token_count +- Impersonal_Pronoun +- First_Person_Single +- Hedges +- Negation +- Subjectivity +- Negative_Emotion +- Reasoning +- Agreement +- Second_Person +- Adverb_Limiter +- Disagreement +- Acknowledgement +- First_Person_Plural +- For_Me +- WH_Questions +- YesNo_Questions +- Bare_Command +- Truth_Intensifier +- Apology +- Ask_Agency +- By_The_Way +- Can_You +- Conjunction_Start +- Could_You +- Filler_Pause +- For_You +- Formal_Title +- Give_Agency +- Affirmation +- Gratitude +- Hello +- Informal_Title +- Let_Me_Know +- Swearing +- Reassurance +- Please +- Positive_Emotion +- Goodbye +- Token_count Related Features ***************** -Politness Strategies \ No newline at end of file +:ref:`politeness_strategies` contains a list of related conversational markers from an older paper (Danescu-Niculescu-Mizil et al., 2013). \ No newline at end of file diff --git a/docs/source/features_conceptual/politeness_strategies.rst b/docs/source/features_conceptual/politeness_strategies.rst index 61769217..9120825f 100644 --- a/docs/source/features_conceptual/politeness_strategies.rst +++ b/docs/source/features_conceptual/politeness_strategies.rst @@ -53,31 +53,32 @@ Interpreting the Feature List of politeness features returned by function (From cited papers): -======== ============================= ================ ===================================================== -Strategy Politeness In top quartile Example -======== ============================= ================ ===================================================== -1. Gratitude 0.87*** 78%*** I really appreciate that you’ve done them. -2. Deference 0.78*** 70%*** Nice work so far on your rewrite. -3. Greeting 0.43*** 45%*** Hey, I just tried to . . . -4. Positive lexicon 0.12*** 32%*** Wow! / This is a great way to deal. . . -5. Negative lexicon -0.13*** 22%** If you’re going to accuse me . . . -6. Apologizing 0.36*** 53%*** Sorry to bother you . . . -7. Please 0.49*** 57%*** Could you please say more. . . -8. Please start −0.30* 22% Please do not remove warnings . . . -9. Indirect (btw) 0.63*** 58%** By the way, where did you find . . . -10. Direct question −0.27*** 15%*** What is your native language? -11. Direct start −0.43*** 9%*** So can you retrieve it or not? -12. Counterfactual modal 0.47*** 52%*** Could/Would you . . . -13. Indicative modal 0.09 27% Can/Will you . . . -14. 1st person start 0.12*** 29%** I have just put the article . . . -15. 1st person pl. 0.08* 27% Could we find a less complex name . . . -16. 1st person 0.08*** 28%*** It is my view that ... -17. 2nd person 0.05*** 30%*** But what’s the good source you have in mind? -18. 2nd person start −0.30*** 17%** You’ve reverted yourself . . . -19. Hedges 0.14*** 28% I suggest we start with . . . -20. Factuality −0.38*** 13%*** In fact you did link, . . . 
-======== ============================= ================ =====================================================
+====== ============================== ===================== ================== =====================================================
+ No.   Strategy                       Politeness Score      In top quartile    Example
+                                      (Positive = More Polite)
+====== ============================== ===================== ================== =====================================================
+ 1.    Gratitude                      0.87***               78%***             I really appreciate that you’ve done them.
+ 2.    Deference                      0.78***               70%***             Nice work so far on your rewrite.
+ 3.    Greeting                       0.43***               45%***             Hey, I just tried to . . .
+ 4.    Positive lexicon               0.12***               32%***             Wow! / This is a great way to deal. . .
+ 5.    Negative lexicon               -0.13***              22%**              If you’re going to accuse me . . .
+ 6.    Apologizing                    0.36***               53%***             Sorry to bother you . . .
+ 7.    Please                         0.49***               57%***             Could you please say more. . .
+ 8.    Please start                   −0.30*                22%                Please do not remove warnings . . .
+ 9.    Indirect (btw)                 0.63***               58%**              By the way, where did you find . . .
+ 10.   Direct question                −0.27***              15%***             What is your native language?
+ 11.   Direct start                   −0.43***              9%***              So can you retrieve it or not?
+ 12.   Counterfactual modal           0.47***               52%***             Could/Would you . . .
+ 13.   Indicative modal               0.09                  27%                Can/Will you . . .
+ 14.   1st person start               0.12***               29%**              I have just put the article . . .
+ 15.   1st person pl.                 0.08*                 27%                Could we find a less complex name . . .
+ 16.   1st person                     0.08***               28%***             It is my view that ...
+ 17.   2nd person                     0.05***               30%***             But what’s the good source you have in mind?
+ 18.   2nd person start               −0.30***              17%**              You’ve reverted yourself . . .
+ 19.   Hedges                         0.14***               28%                I suggest we start with . . .
+ 20.   Factuality                     −0.38***              13%***             In fact you did link, . . .
+====== ============================== ===================== ================== =====================================================
Related Features *****************
-Politeness Receptiveness Markers
\ No newline at end of file
+:ref:`politeness_receptiveness_markers` contains a similar list of markers related to politeness and receptiveness, computed by the SECR module (Yeomans et al., 2020); this can be thought of as a more recent and upgraded version of the original politeness features.
\ No newline at end of file diff --git a/examples/featurize.py b/examples/featurize.py index e44a0e38..70be2752 100644 --- a/examples/featurize.py +++ b/examples/featurize.py @@ -18,7 +18,7 @@ juries_df = pd.read_csv("./example_data/full_empirical_datasets/jury_conversations_with_outcome_var.csv", encoding='utf-8') csop_df = pd.read_csv("./example_data/full_empirical_datasets/csop_conversations_withblanks.csv", encoding='utf-8') csopII_df = pd.read_csv("./example_data/full_empirical_datasets/csopII_conversations_withblanks.csv", encoding='utf-8') - + """ TINY / TEST DATASETS ------------------------------- @@ -38,6 +38,7 @@ """ # Tiny Juries + print("Tiny Juries Example...") tiny_juries_feature_builder = FeatureBuilder( input_df = tiny_juries_df, grouping_keys = ["batch_num", "round_num"], @@ -51,6 +52,25 @@ ) tiny_juries_feature_builder.featurize() + # Tiny Juries with custom aggregations + print("Tiny Juries with Custom Aggregation...") + tiny_juries_feature_builder_custom_agg = FeatureBuilder( + input_df = tiny_juries_df, + grouping_keys = ["batch_num", "round_num"], + output_file_base = "jury_TINY_output_custom_agg", # Naming output files using the output_file_base parameter (recommended) + turns = False, + custom_features = [ + "(BERT) Mimicry", + "Moving Mimicry", + "Forward Flow", + "Discursive Diversity"], + convo_methods = ['max', 'median'], # This will aggregate ONLY the "positive_bert" at the conversation level, using max and median. + convo_columns = ['positive_bert'], + user_methods = ['mean'], # This will aggregate ONLY "negative_bert" at the speaker/user level, using mean. + user_columns = ['negative_bert'], + ) + tiny_juries_feature_builder_custom_agg.featurize() + # Tiny multi-task tiny_multi_task_feature_builder = FeatureBuilder( input_df = tiny_multi_task_df, @@ -104,4 +124,4 @@ # output_file_path_conv_level = "./csopII_output_conversation_level.csv", # turns = True # ) - # csopII_feature_builder.featurize() + # csopII_feature_builder.featurize() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 22eef440..d2c69323 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,8 @@ dependencies = [ "transformers==4.44.0", "tqdm>=4.66.5", "tzdata>=2023.3", - "tzlocal==5.2" + "tzlocal==5.2", + "fuzzywuzzy==0.18.0" ] authors = [ {name = "Xinlan Emily Hu", email = "xehu@wharton.upenn.edu"}, diff --git a/requirements.txt b/requirements.txt index 3c570b21..170e2674 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,3 +28,4 @@ transformers==4.44.0 tqdm>=4.66.5 tzdata>=2023.3 tzlocal==5.2 +fuzzywuzzy==0.18.0 \ No newline at end of file diff --git a/src/team_comm_tools/feature_builder.py b/src/team_comm_tools/feature_builder.py index b80d4837..88164851 100644 --- a/src/team_comm_tools/feature_builder.py +++ b/src/team_comm_tools/feature_builder.py @@ -21,83 +21,98 @@ from team_comm_tools.feature_dict import feature_dict class FeatureBuilder: - """The FeatureBuilder is the main engine that reads in the user's inputs and specifications and generates - conversational features. The FeatureBuilder separately calls the classes (the ChatLevelFeaturesCalculator, - ConversationLevelFeaturesCalculator, and UserLevelFeaturesCalculator) to generate conversational features at - different levels. + """ + The FeatureBuilder is the main engine that reads in the user's inputs and specifications and generates + conversational features. 
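+
+    A minimal usage sketch (a hedged illustration, assuming a pandas DataFrame ``df`` that holds
+    your conversation data using the toolkit's default column names)::
+
+        from team_comm_tools import FeatureBuilder
+        fb = FeatureBuilder(input_df=df, vector_directory="./vector_data/")
+        fb.featurize()
+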
The FeatureBuilder separately calls the classes + (ChatLevelFeaturesCalculator, ConversationLevelFeaturesCalculator, and + UserLevelFeaturesCalculator) to generate conversational features at different levels. :param input_df: A pandas DataFrame containing the conversation data that you wish to featurize. :type input_df: pd.DataFrame - - :param vector_directory: Directory path where the vectors are to be cached. Defaults to "./vector_data/" + :param vector_directory: Directory path where the vectors are to be cached. Defaults to "./vector_data/". :type vector_directory: str - - :param output_file_base: Base name for the output files, which will be used to auto-generate filenames for each of the three levels. Defaults to "output." + :param output_file_base: Base name for the output files, used to auto-generate filenames for each + of the three levels. Defaults to "output." :type output_file_base: str - - :param output_file_path_chat_level: Path where the chat (utterance)-level output csv file is to be generated. (This parameter will override the base name.) + :param output_file_path_chat_level: Path where the chat (utterance)-level output csv file is + to be generated. This parameter will override the base name. :type output_file_path_chat_level: str - - :param output_file_path_user_level: Path where the user (speaker)-level output csv file is to be generated. (This parameter will override the base name.) + :param output_file_path_user_level: Path where the user (speaker)-level output csv file is + to be generated. This parameter will override the base name. :type output_file_path_user_level: str - - :param output_file_path_conv_level: Path where the conversation-level output csv file is to be generated. (This parameter will override the base name.) + :param output_file_path_conv_level: Path where the conversation-level output csv file is to be + generated. This parameter will override the base name. :type output_file_path_conv_level: str - - :param custom_features: A list of additional features outside of the default features that should be calculated. - Defaults to an empty list (i.e., no additional features beyond the defaults will be computed). + :param custom_features: A list of additional features outside of the default features that should + be calculated. Defaults to an empty list (i.e., no additional features beyond the defaults will + be computed). :type custom_features: list, optional - - :param analyze_first_pct: Analyze the first X% of the data. This parameter is useful because the earlier stages of the conversation may be more predictive than the later stages. Thus, researchers may wish to analyze only the first X% of the conversation data and compare the performance with using the full dataset. Defaults to [1.0]. + :param analyze_first_pct: Analyze the first X% of the data. This parameter is useful because the + earlier stages of the conversation may be more predictive than the later stages. Defaults to [1.0]. :type analyze_first_pct: list(float), optional - - :param turns: If true, collapses multiple "chats"/messages by the same speaker in a row into a single "turn." Defaults to False. + :param turns: If true, collapses multiple "chats"/messages by the same speaker in a row into a + single "turn." Defaults to False. :type turns: bool, optional - - :param conversation_id_col: A string representing the column name that should be selected as the conversation ID. Defaults to "conversation_num". 
+    :param conversation_id_col: A string representing the column name that should be selected as
+        the conversation ID. Defaults to "conversation_num".
    :type conversation_id_col: str, optional
-
-    :param speaker_id_col: A string representing the column name that should be selected as the speaker ID. Defaults to "speaker_nickname".
+    :param speaker_id_col: A string representing the column name that should be selected as the speaker ID.
+        Defaults to "speaker_nickname".
    :type speaker_id_col: str, optional
-
-    :param message_col: A string representing the column name that should be selected as the message. Defaults to "message".
+    :param message_col: A string representing the column name that should be selected as the message.
+        Defaults to "message".
    :type message_col: str, optional
-
-    :param timestamp_col: A string representing the column name that should be selected as the message. Defaults to "timestamp".
+    :param timestamp_col: A string representing the column name that should be selected as the timestamp.
+        Defaults to "timestamp".
    :type timestamp_col: str, optional
-
-    :param timestamp_unit: A string representing the unit of the timestamp (if the timestamp is numeric). Default to the unit 'ms' (milliseconds). Other options (D,s,ms,us,ns) can be found on the Pandas reference: https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
+    :param timestamp_unit: A string representing the unit of the timestamp (if the timestamp is numeric).
+        Defaults to 'ms' (milliseconds). Other options (D, s, ms, us, ns) can be found on the Pandas
+        reference: https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
    :type timestamp_unit: str, optional
-
-    :param grouping_keys: A list of multiple identifiers that collectively identify a conversation. If non-empty, we will group by all of the keys in the list and use the grouped key as the unique "conversational identifier."
-        Defaults to an empty list.
+    :param grouping_keys: A list of multiple identifiers that collectively identify a conversation. If
+        non-empty, the data will be grouped by all keys in the list, and the grouped key will be used as
+        the unique "conversational identifier." Defaults to an empty list.
    :type grouping_keys: list, optional
-
-    :param cumulative_grouping: If true, uses a cumulative way of grouping chats (not just looking within a single ID, but also at what happened before.)
-        NOTE: This parameter and the following one (`within_grouping`) was created in the context of a multi-stage Empirica game (see: https://github.com/Watts-Lab/multi-task-empirica).
-        It assumes that there are exactly 3 nested columns at different levels: a High, Mid, and Low level; further, it assumes that these levels are temporally nested: that is, each
-        group/conversation has one High-level identifier, which contains one or more Mid-level identifiers, which contains one or more Low-level identifiers.
-        Defaults to False.
+    :param cumulative_grouping: If true, uses a cumulative way of grouping chats (looking not just within
+        a single ID, but also at what happened before). NOTE: This parameter and the following one
+        (`within_task`) were created in the context of a multi-stage Empirica game (see:
+        https://github.com/Watts-Lab/multi-task-empirica). Assumes exactly 3 temporally nested identifier
+        columns (High, Mid, and Low levels). Defaults to False.
    :type cumulative_grouping: bool, optional
-
-    :param within_task: If true, groups cumulatively in such a way that we only look at prior chats that are of the same "task" (Mid-level identifier).
Defaults to False. + :param within_task: If true, groups cumulatively such that only prior chats of the same "task" + (Mid-level identifier) are considered. Defaults to False. :type within_task: bool, optional - - :param ner_training_df: This is a pandas dataframe of training data for named entity recognition feature. Defaults to None, and will not generate named entity featuers if it does not exist. - :type ner_training_df: pd.DataFrame - - :param ner_cutoff: This is the cutoff value for the confidence of prediction for each named entity. Defaults to 0.9. + :param ner_training_df: A pandas DataFrame of training data for named entity recognition features. + Defaults to None and will not generate named entity features if it does not exist. + :type ner_training_df: pd.DataFrame, optional + :param ner_cutoff: The cutoff value for the confidence of prediction for each named entity. + Defaults to 0.9. :type ner_cutoff: int - - :param regenerate_vectors: If true, will regenerate vector data even if it already exists. Defaults to False. + :param regenerate_vectors: If true, regenerates vector data even if it already exists. Defaults to False. :type regenerate_vectors: bool, optional - - :param compute_vectors_from_preprocessed: If true, computes vectors using preprocessed text (that is, with capitalization and punctuation removed). This was the default behavior for v.0.1.3 and earlier, but we now default to computing metrics on the unpreprocessed text (which INCLUDES capitalization and punctuation). Defaults to False. + :param compute_vectors_from_preprocessed: If true, computes vectors using preprocessed text (with + capitalization and punctuation removed). Defaults to False. :type compute_vectors_from_preprocessed: bool, optional - - :return: The FeatureBuilder doesn't return anything; instead, it writes the generated features to files in the specified paths. It will also print out its progress, so you should see "All Done!" in the terminal, which will indicate that the features have been generated. + :param convo_aggregation: If true, aggregates features at the conversational level. Defaults to True. + :type convo_aggregation: bool, optional + :param convo_methods: Specifies which aggregation functions (e.g., mean, stdev) to use at the + conversational level. Defaults to ['mean', 'max', 'min', 'stdev']. + :type convo_methods: list, optional + :param convo_columns: Specifies which columns (at the utterance/chat level) to aggregate for the + conversational level. Defaults to all numeric columns. + :type convo_columns: list, optional + :param user_aggregation: If true, aggregates features at the speaker/user level. Defaults to True. + :type user_aggregation: bool, optional + :param user_methods: Specifies which functions to aggregate with (e.g., mean, stdev) at the user level. + Defaults to ['mean', 'max', 'min', 'stdev']. + :type user_methods: list, optional + :param user_columns: Specifies which columns (at the utterance/chat level) to aggregate for the + speaker/user level. Defaults to all numeric columns. + :type user_columns: list, optional + + :return: The FeatureBuilder writes the generated features to files in the specified paths. The progress + will be printed in the terminal, indicating completion with "All Done!". 
:rtype: None - """ def __init__( self, @@ -121,9 +136,21 @@ def __init__( ner_training_df: pd.DataFrame = None, ner_cutoff: int = 0.9, regenerate_vectors: bool = False, - compute_vectors_from_preprocessed: bool = False + compute_vectors_from_preprocessed: bool = False, + convo_aggregation = True, + convo_methods: list = ['mean', 'max', 'min', 'stdev'], + convo_columns: list = None, + user_aggregation = True, + user_methods: list = ['mean', 'max', 'min', 'stdev'], + user_columns: list = None ) -> None: + # Some error catching + if type(input_df) != pd.DataFrame: + raise ValueError("You must pass in a valid dataframe as the input_df!") + if not vector_directory: + raise ValueError("You must pass in a valid directory to cache vectors! For example: ./vector_data/") + # Defining input and output paths. self.chat_data = input_df.copy() self.orig_data = input_df.copy() @@ -229,6 +256,12 @@ def __init__( self.within_task = within_task self.ner_cutoff = ner_cutoff self.regenerate_vectors = regenerate_vectors + self.convo_aggregation = convo_aggregation + self.convo_methods = convo_methods + self.convo_columns = convo_columns + self.user_aggregation = user_aggregation + self.user_methods = user_methods + self.user_columns = user_columns if(compute_vectors_from_preprocessed == True): self.vector_colname = self.message_col # because the message col will eventually get preprocessed @@ -260,9 +293,6 @@ def __init__( warnings.warn("NOTE: User has requested cumulative grouping. Auto-generating the key `conversation_num` as the conversation identifier for cumulative conversations.") self.conversation_id_col = "conversation_num" - # Input columns are the columns that come in the raw chat data - self.input_columns = self.chat_data.columns - # Set all paths for vector retrieval (contingent on turns) df_type = "turns" if self.turns else "chats" if(self.cumulative_grouping): # create special vector paths for cumulative groupings @@ -363,7 +393,8 @@ def __init__( if not re.match(r"(.*\/|^)output\/", self.output_file_path_user_level): self.output_file_path_user_level = re.sub(r'/user/', r'/output/user/', self.output_file_path_user_level) - self.vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name + # Logic for processing vector cache + self.vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name self.bert_path = vector_directory + "sentiment/" + ("turns" if self.turns else "chats") + "/" + base_file_name # Check + generate embeddings @@ -492,7 +523,11 @@ def featurize(self) -> None: Path(self.output_file_path_user_level).parent.mkdir(parents=True, exist_ok=True) Path(self.output_file_path_chat_level).parent.mkdir(parents=True, exist_ok=True) Path(self.output_file_path_conv_level).parent.mkdir(parents=True, exist_ok=True) - + + # Store column names of what we generated, so that the user can easily access them + self.chat_features = list(itertools.chain(*[feature_dict[feature]["columns"] for feature in self.feature_names if feature_dict[feature]["level"] == "Chat"])) + self.conv_features_base = list(itertools.chain(*[feature_dict[feature]["columns"] for feature in self.feature_names if feature_dict[feature]["level"] == "Conversation"])) + # Step 3a. Create user level features. print("Generating User Level Features ...") self.user_level_features() @@ -502,14 +537,10 @@ def featurize(self) -> None: self.conv_level_features() self.merge_conv_data_with_original() - # Step 4. 
Write the feartures into the files defined in the output paths. + # Step 4. Write the features into the files defined in the output paths. + self.conv_features_all = [col for col in self.conv_data if col not in list(self.orig_data.columns) + ["conversation_num", self.message_col + "_original", "message_lower_with_punc"]] # save the column names that we generated! print("All Done!") - # Store column names of what we generated, so that the user can easily access them - self.chat_features = list(itertools.chain(*[feature_dict[feature]["columns"] for feature in self.feature_names if feature_dict[feature]["level"] == "Chat"])) - self.conv_features_base = list(itertools.chain(*[feature_dict[feature]["columns"] for feature in self.feature_names if feature_dict[feature]["level"] == "Conversation"])) - self.conv_features_all = [col for col in self.conv_data if col not in self.orig_data and col != 'conversation_num'] - self.save_features() def preprocess_chat_data(self) -> None: @@ -613,7 +644,10 @@ def user_level_features(self) -> None: vect_data= self.vect_data, conversation_id_col = self.conversation_id_col, speaker_id_col = self.speaker_id_col, - input_columns = self.input_columns + user_aggregation = self.user_aggregation, + user_methods = self.user_methods, + user_columns = self.user_columns, + chat_features = self.chat_features ) self.user_data = user_feature_builder.calculate_user_level_features() # Remove special characters in column names @@ -639,7 +673,13 @@ def conv_level_features(self) -> None: speaker_id_col = self.speaker_id_col, message_col = self.message_col, timestamp_col = self.timestamp_col, - input_columns = self.input_columns + convo_aggregation = self.convo_aggregation, + convo_methods = self.convo_methods, + convo_columns = self.convo_columns, + user_aggregation = self.user_aggregation, + user_methods = self.user_methods, + user_columns = self.user_columns, + chat_features = self.chat_features, ) # Calling the driver inside this class to create the features. self.conv_data = conv_feature_builder.calculate_conversation_level_features(self.feature_methods_conv) diff --git a/src/team_comm_tools/utils/calculate_conversation_level_features.py b/src/team_comm_tools/utils/calculate_conversation_level_features.py index b6a667e6..4e959559 100644 --- a/src/team_comm_tools/utils/calculate_conversation_level_features.py +++ b/src/team_comm_tools/utils/calculate_conversation_level_features.py @@ -7,6 +7,7 @@ from team_comm_tools.utils.summarize_features import * from team_comm_tools.utils.gini_coefficient import * from team_comm_tools.utils.preprocess import * +from fuzzywuzzy import process class ConversationLevelFeaturesCalculator: """ @@ -25,8 +26,20 @@ class ConversationLevelFeaturesCalculator: :type vect_data: pd.DataFrame :param vector_directory: Directory where vector files are stored :type vector_directory: str - :param input_columns: List of columns in the chat-level features dataframe that should not be summarized - :type input_columns: list + :param convo_aggregation: If true, will aggregate features at the conversational level + :type convo_aggregation: bool + :param convo_methods: Specifies which functions users want to aggregate with (e.g., mean, stdev...) 
+    :type convo_methods: list
+    :param convo_columns: Specifies which columns (at the chat level) users want aggregated
+    :type convo_columns: list
+    :param user_aggregation: If true, will aggregate features at the user level
+    :type user_aggregation: bool
+    :param user_methods: Specifies which functions users want to aggregate with (e.g., mean, stdev...) at the user level
+    :type user_methods: list
+    :param user_columns: Specifies which columns (at the chat level) users want aggregated for the user level
+    :type user_columns: list
+    :param chat_features: Tracks all the chat-level features generated by the toolkit
+    :type chat_features: list
    """ def __init__(self, chat_data: pd.DataFrame, user_data: pd.DataFrame, @@ -37,8 +50,15 @@ def __init__(self, chat_data: pd.DataFrame, speaker_id_col: str, message_col: str, timestamp_col: str,
-                 input_columns:list) -> None:
-
+                 convo_aggregation: bool,
+                 convo_methods: list,
+                 convo_columns: list,
+                 user_aggregation: bool,
+                 user_methods: list,
+                 user_columns: list,
+                 chat_features: list,
+                 ) -> None:
+
        # Initializing variables self.chat_data = chat_data self.user_data = user_data @@ -49,13 +69,155 @@ def __init__(self, chat_data: pd.DataFrame, self.speaker_id_col = speaker_id_col self.message_col = message_col self.timestamp_col = timestamp_col
-        # Denotes the columns that can be summarized from the chat level, onto the conversation level.
-        self.input_columns = list(input_columns)
-        if 'conversation_num' not in self.input_columns:
-            self.input_columns.append('conversation_num')
-        self.columns_to_summarize = [column for column in self.chat_data.columns \
-                                     if (column not in self.input_columns) and pd.api.types.is_numeric_dtype(self.chat_data[column])]
+        self.convo_aggregation = convo_aggregation
+        self.convo_methods = convo_methods
+        self.user_aggregation = user_aggregation
+        self.user_methods = user_methods
+        self.user_columns = user_columns
+        self.chat_features = chat_features
+
+        def clean_up_aggregation_method_names(aggregation_method_names:list, method_param:str) -> list:
+            """
+            Clean up different ways of specifying the aggregation names; e.g., point "average" and "mean"
+            to the same function
+
+            :param aggregation_method_names: The list of method names requested by the user for aggregation
+            :type aggregation_method_names: list
+
+            :param method_param: The name of the parameter the user specified their methods in (convo_methods or user_methods)
+            :type method_param: str
+
+            :return: the list of valid methods that can be used for aggregation
+            :rtype: list
+            """
+
+            aggregation_method_names = aggregation_method_names.copy()
+
+            for i in range(len(aggregation_method_names)):
+                # directly modify the list to replace synonyms
+                if aggregation_method_names[i] == "average":
+                    aggregation_method_names[i] = "mean"
+                if aggregation_method_names[i] == "maximum":
+                    aggregation_method_names[i] = "max"
+                if aggregation_method_names[i] == "minimum":
+                    aggregation_method_names[i] = "min"
+                if aggregation_method_names[i] == "standard deviation":
+                    aggregation_method_names[i] = "stdev"
+                if aggregation_method_names[i] == "sd":
+                    aggregation_method_names[i] = "stdev"
+                if aggregation_method_names[i] == "std":
+                    aggregation_method_names[i] = "stdev"
+                if aggregation_method_names[i] == "total":
+                    aggregation_method_names[i] = "sum"
+                if aggregation_method_names[i] == "add":
+                    aggregation_method_names[i] = "sum"
+                if aggregation_method_names[i] == "summing":
+                    aggregation_method_names[i] = "sum"
+
+                current = aggregation_method_names[i]
+
+                if current != "mean" and
current != "max" and current != "min" and current != "stdev" and current != "median" and current != "sum": + print(f"WARNING: {current} is not a valid user method (specified in {method_param}). Valid methods are: [mean, max, min, stdev, median, sum]. Ignoring...") + aggregation_method_names.remove(current) + + # print a WARNING for sum, since not all sums make sense; e.g., it makes sense to sum the total number of words, but not to sum the positivity scores + if current == "sum": + print(f"INFO: User requested 'sum' in {method_param}. Ensure summing is appropriate; it is helpful for countable metrics like word counts. For non-countable metrics, such as sentiment ratings, consider using the mean instead.") + + return aggregation_method_names + + def ensure_aggregation_columns_present(user_inputted_columns:list, agg_param:str) -> list: + + """ + An error checking function to ensure that the columns inputted by the user are present in the data. + + :param user_inputted_columns: The list of columns requested by the user for aggregation + :type user_inputted_columns: list + + :param agg_param: The name of the parameter the user specified for aggregation (convo_columns or user_columns) + :type agg_param: str + + :return: the list of valid columns that can be aggregated (they are present in the chat data AND generated by us) + :rtype: list + """ + columns_in_data = list(set(user_inputted_columns).intersection(set(self.chat_features).intersection(set(self.chat_data.columns)))) + if(len(columns_in_data) != len(user_inputted_columns)): + print( + f"WARNING: One or more columns requested for aggregation using the {agg_param} parameter are not valid. Ignoring..." + ) + # help the user fix their error + for i in user_inputted_columns: + matches = process.extract(i, self.chat_data.columns, limit=3) + best_match, similarity = matches[0] + + if similarity == 100: + continue + elif similarity >= 80: + print("Did you mean", best_match, "instead of", i, "?") + else: + print(i, "not found in data and no close match.") + + return columns_in_data + + # check if user inputted convo_columns is None + # If 'None', the default behavior is to summarize all numeric columns generated at the chat level + if convo_columns is None: + self.columns_to_summarize = [column for column in set(self.chat_features).intersection(set(self.chat_data.columns)) \ + if pd.api.types.is_numeric_dtype(self.chat_data[column])] + else: + if convo_aggregation == True and (len(convo_columns) == 0 or len(convo_methods) == 0): + print( + "WARNING: convo_aggregation is True but no convo_columns specified. Defaulting convo_aggregation to False." + ) + self.convo_aggregation = False + else: + # to check if columns are in data and in the list of features we generate + convo_columns_in_data = ensure_aggregation_columns_present(user_inputted_columns = convo_columns, agg_param = "convo_columns") + self.columns_to_summarize = convo_columns_in_data + + # ensure all lowercase + self.convo_methods = [col.lower() for col in self.convo_methods] + self.columns_to_summarize = [col.lower() for col in self.columns_to_summarize] + + # check if columns are numeric + for col in self.columns_to_summarize: + if pd.api.types.is_numeric_dtype(self.chat_data[col]) is False: + print("WARNING: ", col, " (in convo_columns) is not numeric. 
Ignoring...") + self.columns_to_summarize.remove(col) + + # check if user inputted user_columns is None + # as with the conversation level, we default to aggregating all generated chat-level features + if user_columns is None: + self.user_columns = [column for column in set(self.chat_features).intersection(set(self.chat_data.columns)) \ + if pd.api.types.is_numeric_dtype(self.chat_data[column])] + else: + if user_aggregation == True and len(user_columns) == 0: + print("WARNING: user_aggregation is True but no user_columns specified. Defaulting user_aggregation to False.") + self.user_aggregation = False + else: + # to check if columns are in data + user_columns_in_data = ensure_aggregation_columns_present(user_inputted_columns = user_columns, agg_param = "user_columns") + self.user_columns = user_columns_in_data + + # ensure all lowercase + self.user_methods = [col.lower() for col in self.user_methods] + self.user_columns = [col.lower() for col in self.user_columns] + + # check if columns are numeric + for col in self.user_columns: + if pd.api.types.is_numeric_dtype(self.chat_data[col]) is False: + print("WARNING: ", col, " (in user_columns) is not numeric. Ignoring...") + self.user_columns.remove(col) + + # replace interchangable words in convo_methods and remove invalid methods + self.convo_methods = clean_up_aggregation_method_names(aggregation_method_names = self.convo_methods, method_param = "convo_methods") + + # replace interchangable words in user_methods and remove invalid methods + self.user_methods = clean_up_aggregation_method_names(aggregation_method_names = self.user_methods, method_param = "user_methods") + + # columns that need to be summed due to dependency on gini coefficient self.summable_columns = ["num_words", "num_chars", "num_messages"] + def calculate_conversation_level_features(self, feature_methods: list) -> pd.DataFrame: """ @@ -64,6 +226,9 @@ def calculate_conversation_level_features(self, feature_methods: list) -> pd.Dat This function computes various conversation-level features by aggregating chat-level and user-level features, and appends them as new columns to the input conversation-level data. + :param feature_methods: The list of methods to use to generate features + :type turns: list + :return: The conversation-level dataset with new columns for each conversation-level feature :rtype: pd.DataFrame """ @@ -98,7 +263,6 @@ def get_gini_features(self) -> None: - Word count - Character count - Message count - - Function word accommodation The Gini index is then merged into the conversation-level data. 
@@ -106,7 +270,6 @@ def get_gini_features(self) -> None: :rtype: None """ for column in self.summable_columns: - self.conv_data = pd.merge( left=self.conv_data, right=get_gini(self.user_data.copy(), "sum_"+column, self.conversation_id_col), # this applies to the summed columns in user_data, which matches the above @@ -130,42 +293,66 @@ def get_conversation_level_aggregates(self) -> None: :rtype: None """ - # For each summarizable feature - for column in self.columns_to_summarize: - - # Average/Mean of feature across the Conversation - self.conv_data = pd.merge( - left=self.conv_data, - right=get_average(self.chat_data.copy(), column, 'average_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - # Standard Deviation of feature across the Conversation - self.conv_data = pd.merge( - left=self.conv_data, - right=get_stdev(self.chat_data.copy(), column, 'stdev_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - # Minima for the feature across the Conversation - self.conv_data = pd.merge( - left=self.conv_data, - right=get_min(self.chat_data.copy(), column, 'min_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - # Maxima for the feature across the Conversation - self.conv_data = pd.merge( - left=self.conv_data, - right=get_max(self.chat_data.copy(), column, 'max_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - # Do this only for the columns that make sense (e.g., countable things) + if self.convo_aggregation == True: + # For each summarizable feature + for column in self.columns_to_summarize: + + # Average/Mean of feature across the Conversation + if 'mean' in self.convo_methods: + self.conv_data = pd.merge( + left=self.conv_data, + right=get_mean(self.chat_data.copy(), column, 'mean_'+column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + + # Standard Deviation of feature across the Conversation + if 'stdev' in self.convo_methods: + self.conv_data = pd.merge( + left=self.conv_data, + right=get_stdev(self.chat_data.copy(), column, 'stdev_'+column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + + # Minima for the feature across the Conversation + if 'min' in self.convo_methods: + self.conv_data = pd.merge( + left=self.conv_data, + right=get_min(self.chat_data.copy(), column, 'min_'+column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + + # Maxima for the feature across the Conversation + if 'max' in self.convo_methods: + self.conv_data = pd.merge( + left=self.conv_data, + right=get_max(self.chat_data.copy(), column, 'max_'+column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + + # Median for the feature across the Conversation + if 'median' in self.convo_methods: + self.conv_data = pd.merge( + left=self.conv_data, + right=get_median(self.chat_data.copy(), column, 'median_'+column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + + # Sum for the feature across the Conversation + if column not in self.summable_columns: # do this only for things we are not already auto-summarizing + if 'sum' in self.convo_methods: + self.conv_data = pd.merge( + left=self.conv_data, + right=get_sum(self.chat_data.copy(), column, 'sum_'+column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + + # Compute some sums regardless of user specifications, as it's necessary 
for gini. for column in self.summable_columns: # Sum for the feature across the Conversation self.conv_data = pd.merge( @@ -174,7 +361,7 @@ def get_conversation_level_aggregates(self) -> None: on=[self.conversation_id_col], how="inner" ) - + def get_user_level_aggregates(self) -> None: """ Aggregate summary statistics from user-level features to conversation-level features. @@ -189,80 +376,79 @@ def get_user_level_aggregates(self) -> None: - Minimum of averaged user-level features - Maximum of averaged user-level features + :return: None :rtype: None """ - # Sum Columns were created using self.get_user_level_summed_features() - for column in self.columns_to_summarize: + if self.convo_aggregation == True and self.user_aggregation == True: - # Average/Mean of User-Level Feature - self.conv_data = pd.merge( - left=self.conv_data, - right=get_average(self.user_data.copy(), "sum_"+column, 'average_user_sum_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - # Standard Deviation of User-Level Feature - self.conv_data = pd.merge( - left=self.conv_data, - right=get_stdev(self.user_data.copy(), "sum_"+column, 'stdev_user_sum_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - # Minima of User-Level Feature - self.conv_data = pd.merge( - left=self.conv_data, - right=get_min(self.user_data.copy(), "sum_"+column, 'min_user_sum_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - # Maxima of User-Level Feature - self.conv_data = pd.merge( - left=self.conv_data, - right=get_max(self.user_data.copy(), "sum_"+column, 'max_user_sum_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - # Average Columns were created using self.get_user_level_averaged_features() - for column in self.columns_to_summarize: - - # Average/Mean of User-Level Feature - self.conv_data = pd.merge( - left=self.conv_data, - right=get_average(self.user_data.copy(), "average_"+column, 'average_user_avg_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - # Standard Deviation of User-Level Feature - self.conv_data = pd.merge( - left=self.conv_data, - right=get_stdev(self.user_data.copy(), "average_"+column, 'stdev_user_avg_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - # Minima of User-Level Feature - self.conv_data = pd.merge( - left=self.conv_data, - right=get_min(self.user_data.copy(), "average_"+column, 'min_user_avg_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - # Maxima of User-Level Feature - self.conv_data = pd.merge( - left=self.conv_data, - right=get_max(self.user_data.copy(), "average_"+column, 'max_user_avg_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - + # aggregates from the user level based on conversation methods + if 'mean' in self.convo_methods: + for user_column in self.user_columns: + for user_method in self.user_methods: + # Average/Mean of User-Level Feature + self.conv_data = pd.merge( + left=self.conv_data, + right=get_mean(self.user_data.copy(), user_method + "_" +user_column, "mean_user_" + user_method + "_" +user_column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + + if 'stdev' in self.convo_methods: + for user_column in self.user_columns: + for user_method in self.user_methods: + # Standard Deviation of User-Level Feature + self.conv_data = pd.merge( + 
left=self.conv_data, + right=get_stdev(self.user_data.copy(), user_method + "_" + user_column, 'stdev_user_' + user_method + "_" + user_column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + + if 'min' in self.convo_methods: + for user_column in self.user_columns: + for user_method in self.user_methods: + # Minima of User-Level Feature + self.conv_data = pd.merge( + left=self.conv_data, + right=get_min(self.user_data.copy(), user_method + "_" + user_column, 'min_user_' + user_method + "_" + user_column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + + if 'max' in self.convo_methods: + for user_column in self.user_columns: + for user_method in self.user_methods: + # Maxima of User-Level Feature + self.conv_data = pd.merge( + left=self.conv_data, + right=get_max(self.user_data.copy(), user_method + "_" + user_column, 'max_user_' + user_method + "_" + user_column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + + if 'median' in self.convo_methods: + for user_column in self.user_columns: + for user_method in self.user_methods: + # Median of User-Level Feature + self.conv_data = pd.merge( + left=self.conv_data, + right=get_median(self.user_data.copy(), user_method + "_" + user_column, 'median_user_' + user_method + "_" + user_column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + + if 'sum' in self.convo_methods: + for user_column in self.user_columns: + for user_method in self.user_methods: + # Sum of User-Level Feature + self.conv_data = pd.merge( + left=self.conv_data, + right=get_sum(self.user_data.copy(), user_method + "_" + user_column, 'sum_user_' + user_method + "_" + user_column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) def get_discursive_diversity_features(self) -> None: """ diff --git a/src/team_comm_tools/utils/calculate_user_level_features.py b/src/team_comm_tools/utils/calculate_user_level_features.py index 2d131f43..0d9043c9 100644 --- a/src/team_comm_tools/utils/calculate_user_level_features.py +++ b/src/team_comm_tools/utils/calculate_user_level_features.py @@ -1,7 +1,8 @@ # Importing modules from features -from team_comm_tools.utils.summarize_features import get_user_sum_dataframe, get_user_average_dataframe +from team_comm_tools.utils.summarize_features import get_user_sum_dataframe, get_user_mean_dataframe, get_user_max_dataframe, get_user_min_dataframe, get_user_stdev_dataframe, get_user_median_dataframe from team_comm_tools.features.get_user_network import * from team_comm_tools.features.user_centroids import * +from fuzzywuzzy import process class UserLevelFeaturesCalculator: """ @@ -20,10 +21,24 @@ class UserLevelFeaturesCalculator: :type conversation_id_col: str :param speaker_id_col: A string representing the column name that should be selected as the speaker ID. Defaults to "speaker_nickname". :type speaker_id_col: str - :param input_columns: List of columns in the chat-level features dataframe that should not be summarized - :type input_columns: list + :param user_aggregation: If true, will aggregate features at the user level + :type user_aggregation: bool + :param user_methods: Specifies which functions users want to aggregate with (e.g., mean, stdev...) 
at the user level
+    :type user_methods: list
+    :param user_columns: Specifies which columns (at the chat level) users want aggregated for the user level
+    :type user_columns: list
+    :param chat_features: Tracks all the chat-level features generated by the toolkit
+    :type chat_features: list
    """
-    def __init__(self, chat_data: pd.DataFrame, user_data: pd.DataFrame, vect_data: pd.DataFrame, conversation_id_col: str, speaker_id_col: str, input_columns:list) -> None:
+    def __init__(self, chat_data: pd.DataFrame,
+                 user_data: pd.DataFrame,
+                 vect_data: pd.DataFrame,
+                 conversation_id_col: str,
+                 speaker_id_col: str,
+                 user_aggregation: bool,
+                 user_methods: list,
+                 user_columns: list,
+                 chat_features: list) -> None:
        # Initializing variables self.chat_data = chat_data @@ -31,12 +46,100 @@ def __init__(self, chat_data: pd.DataFrame, user_data: pd.DataFrame, vect_data: self.vect_data = vect_data self.conversation_id_col = conversation_id_col self.speaker_id_col = speaker_id_col
-        # Denotes the columns that can be summarized from the chat level, onto the conversation level.
-        self.input_columns = list(input_columns)
-        self.input_columns.append('conversation_num')
-        self.columns_to_summarize = [column for column in self.chat_data.columns \
-                                     if (column not in self.input_columns) and pd.api.types.is_numeric_dtype(self.chat_data[column])]
+        self.user_aggregation = user_aggregation
+        self.user_methods = user_methods
+        self.chat_features = chat_features
+        def clean_up_aggregation_method_names(aggregation_method_names:list) -> list:
+            """
+            Clean up different ways of specifying the aggregation names; e.g., point "average" and "mean"
+            to the same function
+
+            :param aggregation_method_names: The list of method names requested by the user for aggregation
+            :type aggregation_method_names: list
+
+            :return: the list of valid methods that can be used for aggregation
+            :rtype: list
+            """
+
+            aggregation_method_names = aggregation_method_names.copy()
+
+            for i in range(len(aggregation_method_names)):
+                # directly modify the list to replace synonyms
+                if aggregation_method_names[i] == "average":
+                    aggregation_method_names[i] = "mean"
+                if aggregation_method_names[i] == "maximum":
+                    aggregation_method_names[i] = "max"
+                if aggregation_method_names[i] == "minimum":
+                    aggregation_method_names[i] = "min"
+                if aggregation_method_names[i] == "standard deviation":
+                    aggregation_method_names[i] = "stdev"
+                if aggregation_method_names[i] == "sd":
+                    aggregation_method_names[i] = "stdev"
+                if aggregation_method_names[i] == "std":
+                    aggregation_method_names[i] = "stdev"
+                if aggregation_method_names[i] == "total":
+                    aggregation_method_names[i] = "sum"
+                if aggregation_method_names[i] == "add":
+                    aggregation_method_names[i] = "sum"
+                if aggregation_method_names[i] == "summing":
+                    aggregation_method_names[i] = "sum"
+
+                current = aggregation_method_names[i]
+
+                # don't print warnings here, since we already print them in the conversation_level_features equivalent
+                if current != "mean" and current != "max" and current != "min" and current != "stdev" and current != "median" and current != "sum":
+                    aggregation_method_names[i] = None # mark as invalid; removing mid-iteration would raise an IndexError
+
+            return [method for method in aggregation_method_names if method is not None]
+
+        def ensure_aggregation_columns_present(user_inputted_columns:list, agg_param:str) -> list:
+
+            """
+            An error checking function to ensure that the columns inputted by the user are present in the data.
+
+            Does not print warnings, since equivalent warnings are already printed at the conversation level.
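+            For example, a request to aggregate ``positive_bert`` at the user level is honored only if that column both exists in the chat-level data and is among the features generated by the toolkit.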
+
+            :param user_inputted_columns: The list of columns requested by the user for aggregation
+            :type user_inputted_columns: list
+
+            :param agg_param: The name of the parameter the user specified for aggregation (convo_columns or user_columns)
+            :type agg_param: str
+
+            :return: the list of valid columns that can be aggregated (they are present in the chat data AND generated by us)
+            :rtype: list
+            """
+            columns_in_data = list(set(user_inputted_columns).intersection(set(self.chat_features).intersection(set(self.chat_data.columns))))
+            return columns_in_data
+
+        # check if user inputted user_columns is None
+        # we default to aggregating all generated chat-level features
+        if user_columns is None:
+            self.columns_to_summarize = [column for column in set(self.chat_features).intersection(set(self.chat_data.columns)) \
+                                         if pd.api.types.is_numeric_dtype(self.chat_data[column])]
+        else:
+            if user_aggregation == True and (len(user_columns) == 0 or len(user_methods) == 0):
+                self.user_aggregation = False
+            else:
+                # to check if columns are in the data
+                user_columns_in_data = ensure_aggregation_columns_present(user_inputted_columns = user_columns, agg_param = "user_columns")
+                self.columns_to_summarize = user_columns_in_data
+
+                # ensure all lowercase
+                self.user_methods = [col.lower() for col in self.user_methods]
+                self.columns_to_summarize = [col.lower() for col in self.columns_to_summarize]
+
+                # check if columns are numeric
+                for col in self.columns_to_summarize.copy(): # iterate over a copy so that removal is safe
+                    if pd.api.types.is_numeric_dtype(self.chat_data[col]) is False:
+                        self.columns_to_summarize.remove(col)
+
+        # replace interchangeable words in user_methods and remove invalid methods
+        self.user_methods = clean_up_aggregation_method_names(aggregation_method_names = self.user_methods)
+
+        # columns that need to be summed due to dependency on gini coefficient
+        self.summable_columns = ["num_words", "num_chars", "num_messages"]
+
    def calculate_user_level_features(self) -> pd.DataFrame: """ Main driver function for creating user-level features. @@ -48,14 +151,14 @@ ... :rtype: pd.DataFrame """
-        # Get average features for all features
-        self.get_user_level_averaged_features()
-
-        # Get total counts for all features
+        # Get total counts for features that need to be summed, regardless of what the user specified
        self.get_user_level_summed_features()
+        # Get user summary statistics for all features (e.g. mean, min, max, stdev)
+        self.get_user_level_summary_statistics_features()
+
        # Get 4 discursive features (discursive diversity, variance in DD, incongruent modulation, within-person discursive range)
-        # self.get_centroids()
+        self.get_centroids()
        # Get list of other users in a given conversation self.get_user_network() @@ -75,7 +178,66 @@ def get_user_level_summary_statistics_features(self) -> None: This is an open question, so we are putting a TODO here.
""" - pass + + if self.user_aggregation == True: + + # For each summarizable feature + for column in self.columns_to_summarize: + + # Average/Mean of feature across the User + if 'mean' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_mean_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Maxima for the feature across the User + if 'max' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_max_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Minima for the feature across the User + if 'min' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_min_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Standard Deviation of feature across the User + if 'stdev' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_stdev_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Median of feature across the User + if 'median' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_median_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Sum of feature across the User + if column not in self.summable_columns: # do this only for things we are not already auto-summarizing + if 'sum' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_sum_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) def get_user_level_summed_features(self) -> None: """ @@ -85,15 +247,16 @@ def get_user_level_summed_features(self) -> None: - Word count (total number of words) - Character count - Message count - - Function word accommodation This function calculates and merges the summed features into the user-level data. :return: None :rtype: None """ + # For each summarizable feature - for column in self.columns_to_summarize: + for column in self.summable_columns: + # Sum of feature across the Conversation self.user_data = pd.merge( left=self.user_data, @@ -102,25 +265,6 @@ def get_user_level_summed_features(self) -> None: how="inner" ) - def get_user_level_averaged_features(self) -> None: - """ - Aggregate summary statistics by calculating average user-level features from chat-level features. - - This function calculates and merges the average features into the user-level data. - - :return: None - :rtype: None - """ - # For each summarizable feature - for column in self.columns_to_summarize: - # Average/Mean of feature across the Conversation - self.user_data = pd.merge( - left=self.user_data, - right=get_user_average_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), - on=[self.conversation_id_col, self.speaker_id_col], - how="inner" - ) - def get_centroids(self) -> None: """ Calculate the centroid of each user's chats in a given conversation for future discursive metric calculations. 
@@ -130,7 +274,8 @@ def get_centroids(self) -> None: :return: None :rtype: None """
-        self.user_data['mean_embedding'] = get_user_centroids(self.chat_data, self.vect_data, self.conversation_id_col, self.speaker_id_col)
+        if self.vect_data is not None: # only do this if we have vector data for each user
+            self.user_data['mean_embedding'] = get_user_centroids(self.chat_data, self.vect_data, self.conversation_id_col, self.speaker_id_col)
    def get_user_network(self) -> None: """ diff --git a/src/team_comm_tools/utils/summarize_features.py b/src/team_comm_tools/utils/summarize_features.py index c02c9100..4270c55e 100644 --- a/src/team_comm_tools/utils/summarize_features.py +++ b/src/team_comm_tools/utils/summarize_features.py @@ -31,35 +31,131 @@ def get_user_sum_dataframe(chat_level_data, on_column, conversation_id_col, spea return(grouped_conversation_data)
-def get_user_average_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col):
+def get_user_mean_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col):
    """Generate a user-level summary DataFrame by averaging a specified column per individual.
-    This function groups chat-level data by user and conversation, calculates the average values
+    This function groups chat-level data by user and conversation, calculates the mean values
    of a specified numeric column for each user, and returns the resulting DataFrame. :param chat_level_data: The DataFrame in which each row represents a single chat. :type chat_level_data: pandas.DataFrame
-    :param on_column: The name of the numeric column to average for each user.
+    :param on_column: The name of the numeric column for which to compute the per-user mean.
    :type on_column: str :param conversation_id_col: A string representing the column name that should be selected as the conversation ID. :type conversation_id_col: str :param speaker_id: The column name representing the user identifier. :type speaker_id: str
-    :return: A grouped DataFrame with the average of the specified column per individual.
+    :return: A grouped DataFrame with the mean of the specified column per individual.
    :rtype: pandas.DataFrame """ grouped_conversation_data = chat_level_data[[conversation_id_col, speaker_id_col, on_column]].groupby([conversation_id_col, speaker_id_col]).mean().reset_index()
-    grouped_conversation_data = grouped_conversation_data.rename(columns = {on_column: "average_"+on_column}) # gets this dataframe:
-    # Batch# Round# Speaker Average Number of Words
+    grouped_conversation_data = grouped_conversation_data.rename(columns = {on_column: "mean_"+on_column}) # gets this dataframe:
+    # Batch# Round# Speaker Mean Number of Words
    # 0 1 Priya 100 # 0 1 Yuluan 90 return(grouped_conversation_data)
-def get_average(input_data, column_to_summarize, new_column_name, conversation_id_col):
-    """Generate a summary DataFrame with the average of a specified column per conversation.
+def get_user_max_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col):
+    """Generate a user-level summary DataFrame by taking the maximum of a specified column per individual.
-    This function calculates the average of a specified column for each conversation in the input data,
-    and returns a DataFrame containing the conversation number and the calculated average.
+    This function groups chat-level data by user and conversation, calculates the max values
+    of a specified numeric column for each user, and returns the resulting DataFrame.
+
+    :param chat_level_data: The DataFrame in which each row represents a single chat.
+    :type chat_level_data: pandas.DataFrame
+    :param on_column: The name of the numeric column whose maximum is computed for each user.
+    :type on_column: str
+    :param conversation_id_col: A string representing the column name that should be selected as the conversation ID.
+    :type conversation_id_col: str
+    :param speaker_id_col: The column name representing the user identifier.
+    :type speaker_id_col: str
+    :return: A grouped DataFrame with the max of the specified column per individual.
+    :rtype: pandas.DataFrame
+    """
+    grouped_conversation_data = chat_level_data[[conversation_id_col, speaker_id_col, on_column]].groupby([conversation_id_col, speaker_id_col]).max().reset_index()
+    grouped_conversation_data = grouped_conversation_data.rename(columns = {on_column: "max_"+on_column}) # gets this dataframe:
+    # Batch#   Round#   Speaker   Max Number of Words
+    # 0        1        Priya     100
+    # 0        1        Yuluan    90
+    return(grouped_conversation_data)
+
+def get_user_min_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col):
+    """Generate a user-level summary DataFrame by taking the minimum of a specified column per individual.
+
+    This function groups chat-level data by user and conversation, calculates the minimum values
+    of a specified numeric column for each user, and returns the resulting DataFrame.
+
+    :param chat_level_data: The DataFrame in which each row represents a single chat.
+    :type chat_level_data: pandas.DataFrame
+    :param on_column: The name of the numeric column whose minimum is computed for each user.
+    :type on_column: str
+    :param conversation_id_col: A string representing the column name that should be selected as the conversation ID.
+    :type conversation_id_col: str
+    :param speaker_id_col: The column name representing the user identifier.
+    :type speaker_id_col: str
+    :return: A grouped DataFrame with the min of the specified column per individual.
+    :rtype: pandas.DataFrame
+    """
+    grouped_conversation_data = chat_level_data[[conversation_id_col, speaker_id_col, on_column]].groupby([conversation_id_col, speaker_id_col]).min().reset_index()
+    grouped_conversation_data = grouped_conversation_data.rename(columns = {on_column: "min_"+on_column}) # gets this dataframe:
+    # Batch#   Round#   Speaker   Min Number of Words
+    # 0        1        Priya     100
+    # 0        1        Yuluan    90
+    return(grouped_conversation_data)
+
+def get_user_stdev_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col):
+    """Generate a user-level summary DataFrame with the standard deviation of a specified column per individual.
+
+    This function groups chat-level data by user and conversation, calculates the standard deviation
+    of a specified numeric column for each user, and returns the resulting DataFrame.
+
+    :param chat_level_data: The DataFrame in which each row represents a single chat.
+    :type chat_level_data: pandas.DataFrame
+    :param on_column: The name of the numeric column whose standard deviation is computed for each user.
+    :type on_column: str
+    :param conversation_id_col: A string representing the column name that should be selected as the conversation ID.
+    :type conversation_id_col: str
+    :param speaker_id_col: The column name representing the user identifier.
+    :type speaker_id_col: str
+    :return: A grouped DataFrame with the standard deviation of the specified column per individual.
+    :rtype: pandas.DataFrame
+    """
+    grouped_conversation_data = chat_level_data[[conversation_id_col, speaker_id_col, on_column]].groupby([conversation_id_col, speaker_id_col]).std().reset_index()
+    grouped_conversation_data = grouped_conversation_data.rename(columns = {on_column: "stdev_"+on_column}) # gets this dataframe:
+    # Batch#   Round#   Speaker   Standard Deviation of Words
+    # 0        1        Priya     100
+    # 0        1        Yuluan    90
+    return(grouped_conversation_data)
+
+def get_user_median_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col):
+    """Generate a user-level summary DataFrame with the median of a specified column per individual.
+
+    This function groups chat-level data by user and conversation, calculates the median values
+    of a specified numeric column for each user, and returns the resulting DataFrame.
+
+    :param chat_level_data: The DataFrame in which each row represents a single chat.
+    :type chat_level_data: pandas.DataFrame
+    :param on_column: The name of the numeric column whose median is computed for each user.
+    :type on_column: str
+    :param conversation_id_col: A string representing the column name that should be selected as the conversation ID.
+    :type conversation_id_col: str
+    :param speaker_id_col: The column name representing the user identifier.
+    :type speaker_id_col: str
+    :return: A grouped DataFrame with the median of the specified column per individual.
+    :rtype: pandas.DataFrame
+    """
+    grouped_conversation_data = chat_level_data[[conversation_id_col, speaker_id_col, on_column]].groupby([conversation_id_col, speaker_id_col]).median().reset_index()
+    grouped_conversation_data = grouped_conversation_data.rename(columns = {on_column: "median_"+on_column}) # gets this dataframe:
+    # Batch#   Round#   Speaker   Median of Words
+    # 0        1        Priya     100
+    # 0        1        Yuluan    90
+    return(grouped_conversation_data)
+
+def get_mean(input_data, column_to_summarize, new_column_name, conversation_id_col):
+    """Generate a summary DataFrame with the mean of a specified column per conversation.
+
+    This function calculates the mean of a specified column for each conversation in the input data,
+    and returns a DataFrame containing the conversation number and the calculated mean.
 
     :param input_data: The DataFrame containing data at the chat or user level.
     :type input_data: pandas.DataFrame
@@ -69,7 +165,7 @@ def get_average(input_data, column_to_summarize, new_column_name, conversation_i
     :type new_column_name: str
     :param conversation_id_col: A string representing the column name that should be selected as the conversation ID.
     :type conversation_id_col: str
-    :return: A DataFrame with the conversation number and the average of the specified column.
+    :return: A DataFrame with the conversation number and the mean of the specified column.
     :rtype: pandas.DataFrame
     """
     input_data[new_column_name] = input_data.groupby([conversation_id_col], sort=False)[column_to_summarize].transform(lambda x: np.mean(x))
@@ -135,6 +231,26 @@ def get_stdev(input_data, column_to_summarize, new_column_name, conversation_id_
     input_data[new_column_name] = input_data.groupby([conversation_id_col], sort=False)[column_to_summarize].transform(lambda x: np.std(x))
     return(input_data[[conversation_id_col, new_column_name]].drop_duplicates())
 
+def get_median(input_data, column_to_summarize, new_column_name, conversation_id_col):
+    """Generate a summary DataFrame with the median of a specified column per conversation.
+
+    This function calculates the median of a specified column for each conversation in the input data,
+    and returns a DataFrame containing the conversation number and the calculated median.
+
+    :param input_data: The DataFrame containing data at the chat or user level.
+    :type input_data: pandas.DataFrame
+    :param column_to_summarize: The name of the column for which the median is computed.
+    :type column_to_summarize: str
+    :param new_column_name: The desired name for the new summary column.
+    :type new_column_name: str
+    :param conversation_id_col: A string representing the column name that should be selected as the conversation ID.
+    :type conversation_id_col: str
+    :return: A DataFrame with the conversation number and the median of the specified column.
+    :rtype: pandas.DataFrame
+    """
+    input_data[new_column_name] = input_data.groupby([conversation_id_col], sort=False)[column_to_summarize].transform(lambda x: np.median(x))
+    return(input_data[[conversation_id_col, new_column_name]].drop_duplicates())
+
 def get_sum(input_data, column_to_summarize, new_column_name, conversation_id_col):
     """Generate a summary DataFrame with the sum of a specified column per conversation.
 
diff --git a/tests/data/cleaned_data/test_package_aggregation.csv b/tests/data/cleaned_data/test_package_aggregation.csv
new file mode 100644
index 00000000..af59ca61
--- /dev/null
+++ b/tests/data/cleaned_data/test_package_aggregation.csv
@@ -0,0 +1,4 @@
+batch_num,round_num,speaker_hash,speaker_nickname,timestamp,message,majority_pct,num_flipped,flipped_pct,num_votes
+0,0,5e7e1e0031f4e454e196c30b,niceRhino,2020-04-20T18:27:20.125Z,This is my message.,1,1,0.333333333,3
+0,0,5e31d6e4e31c5304c46f1413,culturedCow,2020-04-20T18:27:23.764Z,Hi!,1,1,0.333333333,3
+0,0,5e7e4f4c31f4e454e196c9c4,spryBison,2020-04-20T18:27:27.724Z,How are you?,1,1,0.333333333,3
diff --git a/tests/run_package_grouping_tests.py b/tests/run_package_grouping_tests.py
index 6df66cc1..35d766fc 100644
--- a/tests/run_package_grouping_tests.py
+++ b/tests/run_package_grouping_tests.py
@@ -13,6 +13,7 @@ if __name__ == "__main__":
     tiny_multi_task_renamed_df = pd.read_csv("data/cleaned_data/multi_task_TINY_cols_renamed.csv", encoding='utf-8')
+    package_agg_df = pd.read_csv("data/cleaned_data/test_package_aggregation.csv", encoding='utf-8')
 
     """
     Testing Package Task 1
@@ -190,3 +191,43 @@
     )
     test_vectors.featurize()
 
+    """
+    Test correctness of the custom aggregation pipeline:
+
+    - Aggregate with all of the available functions at the conversation level: [mean, max, min, stdev, median, sum]
+    - Specify 'mean' as 'average' instead and ensure it shows up correctly
+    - Aggregate with "mean" for the user level + a fake method (e.g., "foo")
+    - Aggregate only "second_person_lexical_wordcount" at the conversation level
+    - Aggregate "positive_bert" at the user level + a fake column (e.g., "bar") + a non-numeric column (e.g., "dale_chall_classification")
+    """
+
+    print("Testing custom aggregation...")
+    custom_agg_fb = FeatureBuilder(
+        input_df = package_agg_df,
+        grouping_keys = ["batch_num", "round_num"],
+        vector_directory = "./vector_data/",
+        output_file_base = "custom_agg_test",
+        convo_methods = ['average', 'max', 'min', 'stdev', 'median', 'sum'], # 'average' should resolve to 'mean'
+        convo_columns = ['second_person_lexical_wordcount'],
+        user_methods = ['mean', 'foo'], # 'foo' is a fake method; tests robustness to typos
+        user_columns = ['positive_bert', 'bar', 'dale_chall_classification'], # 'bar' is a fake column and 'dale_chall_classification' is non-numeric; tests robustness to typos
+    )
+    custom_agg_fb.featurize()
+
+
+    """
+    Test the aggregation pipeline when we switch aggregation to False.
+
+    (We should only get the default num words, num chars, and num messages aggregated.)
+    """
+
+    print("Testing aggregation turned off...")
+    custom_agg_fb_no_agg = FeatureBuilder(
+        input_df = package_agg_df,
+        grouping_keys = ["batch_num", "round_num"],
+        vector_directory = "./vector_data/",
+        output_file_base = "custom_agg_test_no_agg",
+        convo_aggregation = False,
+        user_aggregation = False,
+    )
+    custom_agg_fb_no_agg.featurize()
diff --git a/tests/test_package.py b/tests/test_package.py
index ac145b66..8e73f215 100644
--- a/tests/test_package.py
+++ b/tests/test_package.py
@@ -17,11 +17,20 @@
 impropercase_chatdf = pd.read_csv("./output/chat/tiny_multi_task_improper_level_chat.csv")
 sentiment_output = pd.read_csv('./vector_data/sentiment/chats/test_vectors_chat.csv')
 sbert_output = pd.read_csv('./vector_data/sentence/chats/test_vectors_chat.csv')
-
+custom_agg_conv = pd.read_csv('./output/conv/custom_agg_test_conv_level.csv')
+custom_agg_user = pd.read_csv('./output/user/custom_agg_test_user_level.csv')
+custom_no_agg_conv = pd.read_csv('./output/conv/custom_agg_test_no_agg_conv_level.csv')
+custom_no_agg_user = pd.read_csv('./output/user/custom_agg_test_no_agg_user_level.csv')
 
 # Import the Feature Dictionary
 from team_comm_tools.feature_dict import feature_dict
 
+# get the base conversational features
+conv_features_base = list(itertools.chain(*[feature_dict[feature]["columns"] for feature in feature_dict.keys() if feature_dict[feature]["level"] == "Conversation"]))
+
+# this is the dataframe used to get the package aggregations; required for comparison
+package_agg_df = pd.read_csv("data/cleaned_data/test_package_aggregation.csv", encoding='utf-8')
+
 def test_path_robustness():
     # case 1 was specified without the necessary 'output/', 'chat/', and '.csv' in its path. Ensure it works!
     try:
@@ -234,3 +243,69 @@ def test_empty_vectors_equal():
             file.write(f"Empty message vectors / sentence scores are not equal.\n")
 
         raise
+
+def test_custom_aggregation():
+
+    conv_columns = custom_agg_conv.columns
+    user_columns = custom_agg_user.columns
+
+    # aggregated columns: cols that are not in the original data or in conv_features_base
+    conv_columns_agg = [col for col in conv_columns if col not in ["conversation_num", "message_original", "message_lower_with_punc"] + list(package_agg_df.columns) + conv_features_base]
+    user_columns_agg = [col for col in user_columns if col not in ["conversation_num", "user_list"] + list(package_agg_df.columns) + conv_features_base]
+
+    # default aggregations (sum_num_words, sum_num_chars, and sum_num_messages)
+    default_aggs = ["sum_num_words", "sum_num_chars", "sum_num_messages"]
+
+    # conv: for each of [mean, max, min, stdev, median, sum], we should get an aggregation for second_person_lexical_wordcount
+    agg_funcs = ["mean", "max", "min", "stdev", "median", "sum"]
+    agg_func_cols = [f"{agg}_second_person_lexical_wordcount" for agg in agg_funcs]
+
+    # user: we should get mean_positive_bert
+    user_agg_col = ["mean_positive_bert"]
+
+    # conv: for each of [mean, max, min, stdev, median, sum], we should get an aggregation of the user-level mean_positive_bert (e.g., mean_user_mean_positive_bert)
+    conv_user_agg_func_cols = [f"{agg}_user_mean_positive_bert" for agg in agg_funcs]
+
+    try:
+        # assert that we have all the aggregations we expect at the conv level
+        assert(len(set(conv_columns_agg).difference(set(default_aggs+agg_func_cols+conv_user_agg_func_cols)))==0)
+
+        # assert that we have all the aggregations we expect at the user level
+        assert(len(set(user_columns_agg).difference(set(default_aggs+user_agg_col)))==0)
+
+    except AssertionError:
+        with open('test.log', 'a') as file:
+            file.write("\n")
+            file.write("------TEST FAILED------\n")
+            file.write("Custom aggregated columns are not what we expect.\n")
+
+        raise
+
+def test_custom_aggregation_turned_off():
+
+    # we should get only sum_num_words, sum_num_chars, and sum_num_messages in both user and conv
+    conv_columns = custom_no_agg_conv.columns
+    user_columns = custom_no_agg_user.columns
+
+    # aggregated columns: cols that are not in the original data or in conv_features_base
+    conv_columns_agg = [col for col in conv_columns if col not in ["conversation_num", "message_original", "message_lower_with_punc"] + list(package_agg_df.columns) + conv_features_base]
+    user_columns_agg = [col for col in user_columns if col not in ["conversation_num", "user_list"] + list(package_agg_df.columns) + conv_features_base]
+
+    # default aggregations (sum_num_words, sum_num_chars, and sum_num_messages)
+    default_aggs = ["sum_num_words", "sum_num_chars", "sum_num_messages"]
+
+    try:
+        # assert that we have ONLY the default_aggs at the conv level
+        assert(len(set(conv_columns_agg).difference(set(default_aggs)))==0)
+
+        # assert that we have ONLY the default_aggs at the user level
+        assert(len(set(user_columns_agg).difference(set(default_aggs)))==0)
+
+    except AssertionError:
+        with open('test.log', 'a') as file:
+            file.write("\n")
+            file.write("------TEST FAILED------\n")
+            file.write("Default aggregated columns are NOT the only aggregated columns present in the dataframe when aggregations are off.\n")
+
+        raise
+
\ No newline at end of file
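
Reviewer sketch (not part of the patch): a minimal way to exercise the new summary helpers added to summarize_features.py above. This assumes the import path team_comm_tools.utils.summarize_features matches the file path in the diff; the toy DataFrame and its column names are hypothetical.

    import pandas as pd
    from team_comm_tools.utils.summarize_features import (
        get_user_mean_dataframe,
        get_user_max_dataframe,
        get_median,
    )

    # hypothetical chat-level data: one conversation, two speakers
    chats = pd.DataFrame({
        "conversation_num": [0, 0, 0],
        "speaker_nickname": ["Priya", "Priya", "Yuluan"],
        "num_words": [4, 6, 9],
    })

    # user level: one row per (conversation, speaker); column renamed to mean_num_words / max_num_words
    user_means = get_user_mean_dataframe(chats, "num_words", "conversation_num", "speaker_nickname")
    user_maxes = get_user_max_dataframe(chats, "num_words", "conversation_num", "speaker_nickname")

    # conversation level: one row per conversation, median stored under the supplied column name
    convo_medians = get_median(chats, "num_words", "median_num_words", "conversation_num")

Each helper follows the same pattern as the pre-existing get_user_sum_dataframe: group by (conversation, speaker), apply the statistic, and prefix the output column with the statistic's name (mean_, max_, min_, stdev_, median_).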
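Likewise, a sketch of the FeatureBuilder configuration that the new tests exercise. The parameter names (convo_methods, convo_columns, user_methods, user_columns, convo_aggregation, user_aggregation) are taken from the test diff above, and the expected output columns are those asserted in test_custom_aggregation; the input DataFrame here is hypothetical, mirroring tests/data/cleaned_data/test_package_aggregation.csv. That 'average' resolves to 'mean', and that fake methods/columns are tolerated rather than fatal, is inferred from the typo tests, not verified here.

    import pandas as pd
    from team_comm_tools import FeatureBuilder

    # hypothetical input mirroring the new test CSV
    df = pd.DataFrame({
        "batch_num": [0, 0, 0],
        "round_num": [0, 0, 0],
        "speaker_nickname": ["niceRhino", "culturedCow", "spryBison"],
        "message": ["This is my message.", "Hi!", "How are you?"],
    })

    fb = FeatureBuilder(
        input_df = df,
        grouping_keys = ["batch_num", "round_num"],
        vector_directory = "./vector_data/",
        output_file_base = "agg_demo",
        convo_methods = ["mean", "max", "min", "stdev", "median", "sum"],
        convo_columns = ["second_person_lexical_wordcount"],
        user_methods = ["mean"],
        user_columns = ["positive_bert"],
    )
    fb.featurize()

    # per test_custom_aggregation, the conversation-level output should gain
    #   {mean,max,min,stdev,median,sum}_second_person_lexical_wordcount and
    #   {mean,max,min,stdev,median,sum}_user_mean_positive_bert,
    # the user-level output should gain mean_positive_bert, and passing
    # convo_aggregation=False / user_aggregation=False leaves only the defaults
    # (sum_num_words, sum_num_chars, sum_num_messages).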