diff --git a/lang_chain.ipynb b/lang_chain.ipynb deleted file mode 100644 index 13b4d73..0000000 --- a/lang_chain.ipynb +++ /dev/null @@ -1,1443 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "amDVyQ07asm3" - }, - "outputs": [], - "source": [ - "!pip install dowhy" - ] - }, - { - "cell_type": "code", - "source": [ - "!pip install psmpy" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "S-RoIMVnd1Vx", - "outputId": "e00e312b-d7a1-4370-a2db-5376e4f64360" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting psmpy\n", - " Downloading psmpy-0.3.13-py3-none-any.whl (13 kB)\n", - "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from psmpy) (3.7.1)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from psmpy) (1.23.5)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from psmpy) (1.5.3)\n", - "Requirement already satisfied: seaborn in /usr/local/lib/python3.10/dist-packages (from psmpy) (0.12.2)\n", - "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from psmpy) (1.2.2)\n", - "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from psmpy) (1.10.1)\n", - "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->psmpy) (1.1.0)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->psmpy) (0.11.0)\n", - "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->psmpy) (4.42.0)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->psmpy) (1.4.4)\n", - "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->psmpy) (23.1)\n", - "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->psmpy) (9.4.0)\n", - "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->psmpy) (3.1.1)\n", - "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->psmpy) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->psmpy) (2023.3)\n", - "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->psmpy) (1.3.2)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->psmpy) (3.2.0)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->psmpy) (1.16.0)\n", - "Installing collected packages: psmpy\n", - "Successfully installed psmpy-0.3.13\n" - ] - } - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vSfUZvQqnbOG", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "67c0116c-b550-47ec-a8fb-bf5c09cf9c60" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "CPU times: user 938 ms, sys: 137 ms, total: 1.07 s\n", - "Wall time: 2.91 s\n" - ] - } - ], - "source": [ - "%%time\n", - "import pandas as pd\n", - "import numpy as np\n", - "# import dowhy\n", - "from sklearn.linear_model import LinearRegression\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "givW9Uvbt7HN" - }, - "outputs": [], - "source": [ - "data_df = pd.read_pickle('/content/drive/MyDrive/Computational Social Systems/Master Thesis/headline_causal.pkl')\n", - "# data_df.drop(columns=['headline', 'clicks', 'clarification'], inplace=True)\n", - "# data_df['clarification'] = data_df['clarification'].map({'yes': 1, 'no': 0})\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kEfLtzr36dXX", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 36 - }, - "outputId": "60425119-2e67-4a3d-c313-8a0b35da4290" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "'Let’s See … Hire Cops, Pay Teachers, Buy Books For Schools. Or Kill People. Hard Choice, Right?'" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - } - }, - "metadata": {}, - "execution_count": 11 - } - ], - "source": [ - "\n", - "\n", - "list(data_df.columns)\n", - "data_df.headline.to_numpy()[0]\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UpptTAlaGqlY" - }, - "source": [ - "index = (for all indices in embedding)\n", - "\n", - "    model = CTR ~ embedding[!index]\n", - "\n", - "    perPerson_perIndex_predicted_CTR = predict(model, for each person)\n", - "\n", - "    perPerson_group = embedding[index] >= median(embedding[index])\n", - "\n", - "\n", - "$R_i^2$ for each pair where one person is in each group, and predicted CTR is optimally close\n", - " what is the actual CTR difference?\n", - "\n", - "List of most predictive indices:\n", - "\n", - "2, 145, 12..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jorDjF9zIEQg" - }, - "outputs": [], - "source": [ - "outcome_column = 'CTR'\n", - "predictor_columns = [col for col in data_df.columns if col != outcome_column]\n", - "\n", - "\n", - "for index_column in predictor_columns:\n", - "\n", - " predictors = [col for col in predictor_columns if col != index_column]\n", - "\n", - " model = LinearRegression()\n", - " model.fit(data_df[predictors], data_df[outcome_column])\n", - "\n", - " data_df['predicted_CTR_' + index_column] = model.predict(data_df[predictors])\n", - "\n", - " median_embedding = data_df[index_column].median()\n", - " data_df['perPerson_group_' + index_column] = data_df[index_column] >= median_embedding\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KYoX2laPYStO" - }, - "outputs": [], - "source": [ - "list(data_df.columns)\n", - "# data_new_df = pd.read_pickle('hello.pkl')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2rU3mHa66Y8T" - }, - "outputs": [], - "source": [ - "\n", - "\n", - "# Step 3: Define a causal model\n", - "model = dowhy.CausalModel(\n", - " data=data_df,\n", - " treatment='formality_formal', # Replace with the name of your treatment variable column\n", - " outcome='CTR', # Replace with the name of your outcome variable column\n", - " common_causes=[\n", - " 'emotion_anger',\n", - " 'emotion_disgust',\n", - " 'emotion_fear',\n", - " 'emotion_joy',\n", - " 'emotion_neutral',\n", - " 'emotion_sadness',\n", - " 'emotion_surprise',\n", - " 'syntax_complexity',\n", - " 'basic_length',\n", - " 'giberish',\n", - " 'dale_chall_score',\n", - " 'sentiment_negative',\n", - " 'sentiment_neutral',\n", - " 'sentiment_positive'] # List other confounding variables\n", - ")\n", - "\n", - "# Step 4: Identify causal estimands\n", - "identified_estimand = model.identify_effect()\n", - "\n", - "# Step 5: Estimate causal effect using linear regression\n", - "estimate = model.estimate_effect(identified_estimand,\n", - " method_name='backdoor.linear_regression')\n", - "\n", - "# Print the causal effect estimate\n", - "print(estimate)\n", - "\n", - "\n", - "model.view_model()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Uf3JhhmXpPpv" - }, - "outputs": [], - "source": [ - "list(data.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 256 - }, - "id": "LVBAfrEJU0pQ", - "outputId": "a96e5e5a-83d9-4db0-a00e-f8a6d7ddfe50" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " CTR formality_formal formality_informal emotion_anger \\\n", - "0 0.002566 0.998468 0.001532 0.409045 \n", - "1 0.028341 0.019113 0.980887 0.031717 \n", - "2 0.006298 0.035468 0.964532 0.342199 \n", - "3 0.008742 0.998651 0.001349 0.204931 \n", - "4 0.003279 0.964811 0.035189 0.316080 \n", - "\n", - " emotion_disgust emotion_fear emotion_joy emotion_neutral \\\n", - "0 0.221184 0.063077 0.006619 0.250574 \n", - "1 0.085310 0.030770 0.007533 0.720395 \n", - "2 0.382035 0.093000 0.002185 0.131670 \n", - "3 0.533624 0.067115 0.001463 0.108103 \n", - "4 0.077622 0.040388 0.005667 0.213170 \n", - "\n", - " emotion_sadness emotion_surprise ... predicted_CTR_embedding_380 \\\n", - "0 0.032195 0.017306 ... 0.014411 \n", - "1 0.076398 0.047876 ... 0.021069 \n", - "2 0.040235 0.008678 ... 0.015430 \n", - "3 0.064993 0.019770 ... 0.012817 \n", - "4 0.339549 0.007523 ... 0.013880 \n", - "\n", - " perPerson_group_embedding_380 predicted_CTR_embedding_381 \\\n", - "0 False 0.014180 \n", - "1 True 0.021140 \n", - "2 False 0.015175 \n", - "3 False 0.012809 \n", - "4 False 0.013911 \n", - "\n", - " perPerson_group_embedding_381 predicted_CTR_embedding_382 \\\n", - "0 True 0.014519 \n", - "1 True 0.021144 \n", - "2 False 0.015451 \n", - "3 False 0.012844 \n", - "4 False 0.014059 \n", - "\n", - " perPerson_group_embedding_382 predicted_CTR_embedding_383 \\\n", - "0 False 0.014455 \n", - "1 False 0.021086 \n", - "2 False 0.015228 \n", - "3 True 0.012904 \n", - "4 False 0.013964 \n", - "\n", - " perPerson_group_embedding_383 predicted_CTR_embedding_384 \\\n", - "0 True 0.014379 \n", - "1 False 0.021089 \n", - "2 True 0.015363 \n", - "3 True 0.012787 \n", - "4 True 0.013878 \n", - "\n", - " perPerson_group_embedding_384 \n", - "0 True \n", - "1 False \n", - "2 False \n", - "3 False \n", - "4 False \n", - "\n", - "[5 rows x 1201 columns]" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CTRformality_formalformality_informalemotion_angeremotion_disgustemotion_fearemotion_joyemotion_neutralemotion_sadnessemotion_surprise...predicted_CTR_embedding_380perPerson_group_embedding_380predicted_CTR_embedding_381perPerson_group_embedding_381predicted_CTR_embedding_382perPerson_group_embedding_382predicted_CTR_embedding_383perPerson_group_embedding_383predicted_CTR_embedding_384perPerson_group_embedding_384
00.0025660.9984680.0015320.4090450.2211840.0630770.0066190.2505740.0321950.017306...0.014411False0.014180True0.014519False0.014455True0.014379True
10.0283410.0191130.9808870.0317170.0853100.0307700.0075330.7203950.0763980.047876...0.021069True0.021140True0.021144False0.021086False0.021089False
20.0062980.0354680.9645320.3421990.3820350.0930000.0021850.1316700.0402350.008678...0.015430False0.015175False0.015451False0.015228True0.015363False
30.0087420.9986510.0013490.2049310.5336240.0671150.0014630.1081030.0649930.019770...0.012817False0.012809False0.012844True0.012904True0.012787False
40.0032790.9648110.0351890.3160800.0776220.0403880.0056670.2131700.3395490.007523...0.013880False0.013911False0.014059False0.013964True0.013878False
\n", - "

5 rows × 1201 columns

\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "
\n", - "
\n" - ] - }, - "metadata": {}, - "execution_count": 6 - } - ], - "source": [ - "data = pd.read_pickle('/content/drive/MyDrive/Computational Social Systems/Master Thesis/headline_causal_processed.pkl')\n", - "data.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "LOwbwKSmb44T" - }, - "outputs": [], - "source": [ - "from dowhy import CausalModel\n", - "\n", - "# df = pd.DataFrame(data)\n", - "def do_analysis(common_causes_main, df):\n", - " for cause in common_causes_main:\n", - " common_causes = [item for item in common_causes_main if item != cause]\n", - " embedding_list = ['embedding_%d'%number for number in range(1,385)]\n", - " common_causes.extend(embedding_list)\n", - "\n", - " causal_model = CausalModel(\n", - " data=df,\n", - " treatment='perPerson_group_'+cause,\n", - " outcome='CTR',\n", - " common_causes = common_causes,\n", - " instruments=['predicted_CTR_'+cause]\n", - " )\n", - "\n", - " # Identify the estimand\n", - " identified_estimand = causal_model.identify_effect()\n", - "\n", - " # Estimate the treatment effect using propensity score matching\n", - " estimate = causal_model.estimate_effect(\n", - " identified_estimand,\n", - " method_name=\"backdoor.propensity_score_matching\",\n", - " target_units=\"ate\",\n", - " )\n", - "\n", - "\n", - " print(cause, '-', estimate.value)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "zSwLEBZLoIZO", - "outputId": "dd18f7b0-b1f2-4d8b-e15d-7e7154d30dea" - }, - "outputs": [ - { - "metadata": { - "tags": null - }, - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:dowhy.causal_model:Causal Graph not provided. DoWhy will construct a graph based on data inputs.\n", - "WARNING:dowhy.causal_model:There are an additional 800 variables in the dataset that are not in the graph. Variable names are: '['formality_formal', 'formality_informal', 'perPerson_group_basic_length', 'perPerson_group_dale_chall_score', 'perPerson_group_embedding_1', 'perPerson_group_embedding_10', 'perPerson_group_embedding_100', 'perPerson_group_embedding_101', 'perPerson_group_embedding_102', 'perPerson_group_embedding_103', 'perPerson_group_embedding_104', 'perPerson_group_embedding_105', 'perPerson_group_embedding_106', 'perPerson_group_embedding_107', 'perPerson_group_embedding_108', 'perPerson_group_embedding_109', 'perPerson_group_embedding_11', 'perPerson_group_embedding_110', 'perPerson_group_embedding_111', 'perPerson_group_embedding_112', 'perPerson_group_embedding_113', 'perPerson_group_embedding_114', 'perPerson_group_embedding_115', 'perPerson_group_embedding_116', 'perPerson_group_embedding_117', 'perPerson_group_embedding_118', 'perPerson_group_embedding_119', 'perPerson_group_embedding_12', 'perPerson_group_embedding_120', 'perPerson_group_embedding_121', 'perPerson_group_embedding_122', 'perPerson_group_embedding_123', 'perPerson_group_embedding_124', 'perPerson_group_embedding_125', 'perPerson_group_embedding_126', 'perPerson_group_embedding_127', 'perPerson_group_embedding_128', 'perPerson_group_embedding_129', 'perPerson_group_embedding_13', 'perPerson_group_embedding_130', 'perPerson_group_embedding_131', 'perPerson_group_embedding_132', 'perPerson_group_embedding_133', 'perPerson_group_embedding_134', 'perPerson_group_embedding_135', 'perPerson_group_embedding_136', 'perPerson_group_embedding_137', 'perPerson_group_embedding_138', 'perPerson_group_embedding_139', 'perPerson_group_embedding_14', 'perPerson_group_embedding_140', 'perPerson_group_embedding_141', 'perPerson_group_embedding_142', 'perPerson_group_embedding_143', 'perPerson_group_embedding_144', 'perPerson_group_embedding_145', 'perPerson_group_embedding_146', 'perPerson_group_embedding_147', 'perPerson_group_embedding_148', 'perPerson_group_embedding_149', 'perPerson_group_embedding_15', 'perPerson_group_embedding_150', 'perPerson_group_embedding_151', 'perPerson_group_embedding_152', 'perPerson_group_embedding_153', 'perPerson_group_embedding_154', 'perPerson_group_embedding_155', 'perPerson_group_embedding_156', 'perPerson_group_embedding_157', 'perPerson_group_embedding_158', 'perPerson_group_embedding_159', 'perPerson_group_embedding_16', 'perPerson_group_embedding_160', 'perPerson_group_embedding_161', 'perPerson_group_embedding_162', 'perPerson_group_embedding_163', 'perPerson_group_embedding_164', 'perPerson_group_embedding_165', 'perPerson_group_embedding_166', 'perPerson_group_embedding_167', 'perPerson_group_embedding_168', 'perPerson_group_embedding_169', 'perPerson_group_embedding_17', 'perPerson_group_embedding_170', 'perPerson_group_embedding_171', 'perPerson_group_embedding_172', 'perPerson_group_embedding_173', 'perPerson_group_embedding_174', 'perPerson_group_embedding_175', 'perPerson_group_embedding_176', 'perPerson_group_embedding_177', 'perPerson_group_embedding_178', 'perPerson_group_embedding_179', 'perPerson_group_embedding_18', 'perPerson_group_embedding_180', 'perPerson_group_embedding_181', 'perPerson_group_embedding_182', 'perPerson_group_embedding_183', 'perPerson_group_embedding_184', 'perPerson_group_embedding_185', 'perPerson_group_embedding_186', 'perPerson_group_embedding_187', 'perPerson_group_embedding_188', 'perPerson_group_embedding_189', 'perPerson_group_embedding_19', 'perPerson_group_embedding_190', 'perPerson_group_embedding_191', 'perPerson_group_embedding_192', 'perPerson_group_embedding_193', 'perPerson_group_embedding_194', 'perPerson_group_embedding_195', 'perPerson_group_embedding_196', 'perPerson_group_embedding_197', 'perPerson_group_embedding_198', 'perPerson_group_embedding_199', 'perPerson_group_embedding_2', 'perPerson_group_embedding_20', 'perPerson_group_embedding_200', 'perPerson_group_embedding_201', 'perPerson_group_embedding_202', 'perPerson_group_embedding_203', 'perPerson_group_embedding_204', 'perPerson_group_embedding_205', 'perPerson_group_embedding_206', 'perPerson_group_embedding_207', 'perPerson_group_embedding_208', 'perPerson_group_embedding_209', 'perPerson_group_embedding_21', 'perPerson_group_embedding_210', 'perPerson_group_embedding_211', 'perPerson_group_embedding_212', 'perPerson_group_embedding_213', 'perPerson_group_embedding_214', 'perPerson_group_embedding_215', 'perPerson_group_embedding_216', 'perPerson_group_embedding_217', 'perPerson_group_embedding_218', 'perPerson_group_embedding_219', 'perPerson_group_embedding_22', 'perPerson_group_embedding_220', 'perPerson_group_embedding_221', 'perPerson_group_embedding_222', 'perPerson_group_embedding_223', 'perPerson_group_embedding_224', 'perPerson_group_embedding_225', 'perPerson_group_embedding_226', 'perPerson_group_embedding_227', 'perPerson_group_embedding_228', 'perPerson_group_embedding_229', 'perPerson_group_embedding_23', 'perPerson_group_embedding_230', 'perPerson_group_embedding_231', 'perPerson_group_embedding_232', 'perPerson_group_embedding_233', 'perPerson_group_embedding_234', 'perPerson_group_embedding_235', 'perPerson_group_embedding_236', 'perPerson_group_embedding_237', 'perPerson_group_embedding_238', 'perPerson_group_embedding_239', 'perPerson_group_embedding_24', 'perPerson_group_embedding_240', 'perPerson_group_embedding_241', 'perPerson_group_embedding_242', 'perPerson_group_embedding_243', 'perPerson_group_embedding_244', 'perPerson_group_embedding_245', 'perPerson_group_embedding_246', 'perPerson_group_embedding_247', 'perPerson_group_embedding_248', 'perPerson_group_embedding_249', 'perPerson_group_embedding_25', 'perPerson_group_embedding_250', 'perPerson_group_embedding_251', 'perPerson_group_embedding_252', 'perPerson_group_embedding_253', 'perPerson_group_embedding_254', 'perPerson_group_embedding_255', 'perPerson_group_embedding_256', 'perPerson_group_embedding_257', 'perPerson_group_embedding_258', 'perPerson_group_embedding_259', 'perPerson_group_embedding_26', 'perPerson_group_embedding_260', 'perPerson_group_embedding_261', 'perPerson_group_embedding_262', 'perPerson_group_embedding_263', 'perPerson_group_embedding_264', 'perPerson_group_embedding_265', 'perPerson_group_embedding_266', 'perPerson_group_embedding_267', 'perPerson_group_embedding_268', 'perPerson_group_embedding_269', 'perPerson_group_embedding_27', 'perPerson_group_embedding_270', 'perPerson_group_embedding_271', 'perPerson_group_embedding_272', 'perPerson_group_embedding_273', 'perPerson_group_embedding_274', 'perPerson_group_embedding_275', 'perPerson_group_embedding_276', 'perPerson_group_embedding_277', 'perPerson_group_embedding_278', 'perPerson_group_embedding_279', 'perPerson_group_embedding_28', 'perPerson_group_embedding_280', 'perPerson_group_embedding_281', 'perPerson_group_embedding_282', 'perPerson_group_embedding_283', 'perPerson_group_embedding_284', 'perPerson_group_embedding_285', 'perPerson_group_embedding_286', 'perPerson_group_embedding_287', 'perPerson_group_embedding_288', 'perPerson_group_embedding_289', 'perPerson_group_embedding_29', 'perPerson_group_embedding_290', 'perPerson_group_embedding_291', 'perPerson_group_embedding_292', 'perPerson_group_embedding_293', 'perPerson_group_embedding_294', 'perPerson_group_embedding_295', 'perPerson_group_embedding_296', 'perPerson_group_embedding_297', 'perPerson_group_embedding_298', 'perPerson_group_embedding_299', 'perPerson_group_embedding_3', 'perPerson_group_embedding_30', 'perPerson_group_embedding_300', 'perPerson_group_embedding_301', 'perPerson_group_embedding_302', 'perPerson_group_embedding_303', 'perPerson_group_embedding_304', 'perPerson_group_embedding_305', 'perPerson_group_embedding_306', 'perPerson_group_embedding_307', 'perPerson_group_embedding_308', 'perPerson_group_embedding_309', 'perPerson_group_embedding_31', 'perPerson_group_embedding_310', 'perPerson_group_embedding_311', 'perPerson_group_embedding_312', 'perPerson_group_embedding_313', 'perPerson_group_embedding_314', 'perPerson_group_embedding_315', 'perPerson_group_embedding_316', 'perPerson_group_embedding_317', 'perPerson_group_embedding_318', 'perPerson_group_embedding_319', 'perPerson_group_embedding_32', 'perPerson_group_embedding_320', 'perPerson_group_embedding_321', 'perPerson_group_embedding_322', 'perPerson_group_embedding_323', 'perPerson_group_embedding_324', 'perPerson_group_embedding_325', 'perPerson_group_embedding_326', 'perPerson_group_embedding_327', 'perPerson_group_embedding_328', 'perPerson_group_embedding_329', 'perPerson_group_embedding_33', 'perPerson_group_embedding_330', 'perPerson_group_embedding_331', 'perPerson_group_embedding_332', 'perPerson_group_embedding_333', 'perPerson_group_embedding_334', 'perPerson_group_embedding_335', 'perPerson_group_embedding_336', 'perPerson_group_embedding_337', 'perPerson_group_embedding_338', 'perPerson_group_embedding_339', 'perPerson_group_embedding_34', 'perPerson_group_embedding_340', 'perPerson_group_embedding_341', 'perPerson_group_embedding_342', 'perPerson_group_embedding_343', 'perPerson_group_embedding_344', 'perPerson_group_embedding_345', 'perPerson_group_embedding_346', 'perPerson_group_embedding_347', 'perPerson_group_embedding_348', 'perPerson_group_embedding_349', 'perPerson_group_embedding_35', 'perPerson_group_embedding_350', 'perPerson_group_embedding_351', 'perPerson_group_embedding_352', 'perPerson_group_embedding_353', 'perPerson_group_embedding_354', 'perPerson_group_embedding_355', 'perPerson_group_embedding_356', 'perPerson_group_embedding_357', 'perPerson_group_embedding_358', 'perPerson_group_embedding_359', 'perPerson_group_embedding_36', 'perPerson_group_embedding_360', 'perPerson_group_embedding_361', 'perPerson_group_embedding_362', 'perPerson_group_embedding_363', 'perPerson_group_embedding_364', 'perPerson_group_embedding_365', 'perPerson_group_embedding_366', 'perPerson_group_embedding_367', 'perPerson_group_embedding_368', 'perPerson_group_embedding_369', 'perPerson_group_embedding_37', 'perPerson_group_embedding_370', 'perPerson_group_embedding_371', 'perPerson_group_embedding_372', 'perPerson_group_embedding_373', 'perPerson_group_embedding_374', 'perPerson_group_embedding_375', 'perPerson_group_embedding_376', 'perPerson_group_embedding_377', 'perPerson_group_embedding_378', 'perPerson_group_embedding_379', 'perPerson_group_embedding_38', 'perPerson_group_embedding_380', 'perPerson_group_embedding_381', 'perPerson_group_embedding_382', 'perPerson_group_embedding_383', 'perPerson_group_embedding_384', 'perPerson_group_embedding_39', 'perPerson_group_embedding_4', 'perPerson_group_embedding_40', 'perPerson_group_embedding_41', 'perPerson_group_embedding_42', 'perPerson_group_embedding_43', 'perPerson_group_embedding_44', 'perPerson_group_embedding_45', 'perPerson_group_embedding_46', 'perPerson_group_embedding_47', 'perPerson_group_embedding_48', 'perPerson_group_embedding_49', 'perPerson_group_embedding_5', 'perPerson_group_embedding_50', 'perPerson_group_embedding_51', 'perPerson_group_embedding_52', 'perPerson_group_embedding_53', 'perPerson_group_embedding_54', 'perPerson_group_embedding_55', 'perPerson_group_embedding_56', 'perPerson_group_embedding_57', 'perPerson_group_embedding_58', 'perPerson_group_embedding_59', 'perPerson_group_embedding_6', 'perPerson_group_embedding_60', 'perPerson_group_embedding_61', 'perPerson_group_embedding_62', 'perPerson_group_embedding_63', 'perPerson_group_embedding_64', 'perPerson_group_embedding_65', 'perPerson_group_embedding_66', 'perPerson_group_embedding_67', 'perPerson_group_embedding_68', 'perPerson_group_embedding_69', 'perPerson_group_embedding_7', 'perPerson_group_embedding_70', 'perPerson_group_embedding_71', 'perPerson_group_embedding_72', 'perPerson_group_embedding_73', 'perPerson_group_embedding_74', 'perPerson_group_embedding_75', 'perPerson_group_embedding_76', 'perPerson_group_embedding_77', 'perPerson_group_embedding_78', 'perPerson_group_embedding_79', 'perPerson_group_embedding_8', 'perPerson_group_embedding_80', 'perPerson_group_embedding_81', 'perPerson_group_embedding_82', 'perPerson_group_embedding_83', 'perPerson_group_embedding_84', 'perPerson_group_embedding_85', 'perPerson_group_embedding_86', 'perPerson_group_embedding_87', 'perPerson_group_embedding_88', 'perPerson_group_embedding_89', 'perPerson_group_embedding_9', 'perPerson_group_embedding_90', 'perPerson_group_embedding_91', 'perPerson_group_embedding_92', 'perPerson_group_embedding_93', 'perPerson_group_embedding_94', 'perPerson_group_embedding_95', 'perPerson_group_embedding_96', 'perPerson_group_embedding_97', 'perPerson_group_embedding_98', 'perPerson_group_embedding_99', 'perPerson_group_emotion_anger', 'perPerson_group_emotion_disgust', 'perPerson_group_emotion_fear', 'perPerson_group_emotion_joy', 'perPerson_group_emotion_neutral', 'perPerson_group_emotion_sadness', 'perPerson_group_emotion_surprise', 'perPerson_group_formality_informal', 'perPerson_group_giberish', 'perPerson_group_sentiment_negative', 'perPerson_group_sentiment_neutral', 'perPerson_group_sentiment_positive', 'perPerson_group_syntax_complexity', 'predicted_CTR_basic_length', 'predicted_CTR_dale_chall_score', 'predicted_CTR_embedding_1', 'predicted_CTR_embedding_10', 'predicted_CTR_embedding_100', 'predicted_CTR_embedding_101', 'predicted_CTR_embedding_102', 'predicted_CTR_embedding_103', 'predicted_CTR_embedding_104', 'predicted_CTR_embedding_105', 'predicted_CTR_embedding_106', 'predicted_CTR_embedding_107', 'predicted_CTR_embedding_108', 'predicted_CTR_embedding_109', 'predicted_CTR_embedding_11', 'predicted_CTR_embedding_110', 'predicted_CTR_embedding_111', 'predicted_CTR_embedding_112', 'predicted_CTR_embedding_113', 'predicted_CTR_embedding_114', 'predicted_CTR_embedding_115', 'predicted_CTR_embedding_116', 'predicted_CTR_embedding_117', 'predicted_CTR_embedding_118', 'predicted_CTR_embedding_119', 'predicted_CTR_embedding_12', 'predicted_CTR_embedding_120', 'predicted_CTR_embedding_121', 'predicted_CTR_embedding_122', 'predicted_CTR_embedding_123', 'predicted_CTR_embedding_124', 'predicted_CTR_embedding_125', 'predicted_CTR_embedding_126', 'predicted_CTR_embedding_127', 'predicted_CTR_embedding_128', 'predicted_CTR_embedding_129', 'predicted_CTR_embedding_13', 'predicted_CTR_embedding_130', 'predicted_CTR_embedding_131', 'predicted_CTR_embedding_132', 'predicted_CTR_embedding_133', 'predicted_CTR_embedding_134', 'predicted_CTR_embedding_135', 'predicted_CTR_embedding_136', 'predicted_CTR_embedding_137', 'predicted_CTR_embedding_138', 'predicted_CTR_embedding_139', 'predicted_CTR_embedding_14', 'predicted_CTR_embedding_140', 'predicted_CTR_embedding_141', 'predicted_CTR_embedding_142', 'predicted_CTR_embedding_143', 'predicted_CTR_embedding_144', 'predicted_CTR_embedding_145', 'predicted_CTR_embedding_146', 'predicted_CTR_embedding_147', 'predicted_CTR_embedding_148', 'predicted_CTR_embedding_149', 'predicted_CTR_embedding_15', 'predicted_CTR_embedding_150', 'predicted_CTR_embedding_151', 'predicted_CTR_embedding_152', 'predicted_CTR_embedding_153', 'predicted_CTR_embedding_154', 'predicted_CTR_embedding_155', 'predicted_CTR_embedding_156', 'predicted_CTR_embedding_157', 'predicted_CTR_embedding_158', 'predicted_CTR_embedding_159', 'predicted_CTR_embedding_16', 'predicted_CTR_embedding_160', 'predicted_CTR_embedding_161', 'predicted_CTR_embedding_162', 'predicted_CTR_embedding_163', 'predicted_CTR_embedding_164', 'predicted_CTR_embedding_165', 'predicted_CTR_embedding_166', 'predicted_CTR_embedding_167', 'predicted_CTR_embedding_168', 'predicted_CTR_embedding_169', 'predicted_CTR_embedding_17', 'predicted_CTR_embedding_170', 'predicted_CTR_embedding_171', 'predicted_CTR_embedding_172', 'predicted_CTR_embedding_173', 'predicted_CTR_embedding_174', 'predicted_CTR_embedding_175', 'predicted_CTR_embedding_176', 'predicted_CTR_embedding_177', 'predicted_CTR_embedding_178', 'predicted_CTR_embedding_179', 'predicted_CTR_embedding_18', 'predicted_CTR_embedding_180', 'predicted_CTR_embedding_181', 'predicted_CTR_embedding_182', 'predicted_CTR_embedding_183', 'predicted_CTR_embedding_184', 'predicted_CTR_embedding_185', 'predicted_CTR_embedding_186', 'predicted_CTR_embedding_187', 'predicted_CTR_embedding_188', 'predicted_CTR_embedding_189', 'predicted_CTR_embedding_19', 'predicted_CTR_embedding_190', 'predicted_CTR_embedding_191', 'predicted_CTR_embedding_192', 'predicted_CTR_embedding_193', 'predicted_CTR_embedding_194', 'predicted_CTR_embedding_195', 'predicted_CTR_embedding_196', 'predicted_CTR_embedding_197', 'predicted_CTR_embedding_198', 'predicted_CTR_embedding_199', 'predicted_CTR_embedding_2', 'predicted_CTR_embedding_20', 'predicted_CTR_embedding_200', 'predicted_CTR_embedding_201', 'predicted_CTR_embedding_202', 'predicted_CTR_embedding_203', 'predicted_CTR_embedding_204', 'predicted_CTR_embedding_205', 'predicted_CTR_embedding_206', 'predicted_CTR_embedding_207', 'predicted_CTR_embedding_208', 'predicted_CTR_embedding_209', 'predicted_CTR_embedding_21', 'predicted_CTR_embedding_210', 'predicted_CTR_embedding_211', 'predicted_CTR_embedding_212', 'predicted_CTR_embedding_213', 'predicted_CTR_embedding_214', 'predicted_CTR_embedding_215', 'predicted_CTR_embedding_216', 'predicted_CTR_embedding_217', 'predicted_CTR_embedding_218', 'predicted_CTR_embedding_219', 'predicted_CTR_embedding_22', 'predicted_CTR_embedding_220', 'predicted_CTR_embedding_221', 'predicted_CTR_embedding_222', 'predicted_CTR_embedding_223', 'predicted_CTR_embedding_224', 'predicted_CTR_embedding_225', 'predicted_CTR_embedding_226', 'predicted_CTR_embedding_227', 'predicted_CTR_embedding_228', 'predicted_CTR_embedding_229', 'predicted_CTR_embedding_23', 'predicted_CTR_embedding_230', 'predicted_CTR_embedding_231', 'predicted_CTR_embedding_232', 'predicted_CTR_embedding_233', 'predicted_CTR_embedding_234', 'predicted_CTR_embedding_235', 'predicted_CTR_embedding_236', 'predicted_CTR_embedding_237', 'predicted_CTR_embedding_238', 'predicted_CTR_embedding_239', 'predicted_CTR_embedding_24', 'predicted_CTR_embedding_240', 'predicted_CTR_embedding_241', 'predicted_CTR_embedding_242', 'predicted_CTR_embedding_243', 'predicted_CTR_embedding_244', 'predicted_CTR_embedding_245', 'predicted_CTR_embedding_246', 'predicted_CTR_embedding_247', 'predicted_CTR_embedding_248', 'predicted_CTR_embedding_249', 'predicted_CTR_embedding_25', 'predicted_CTR_embedding_250', 'predicted_CTR_embedding_251', 'predicted_CTR_embedding_252', 'predicted_CTR_embedding_253', 'predicted_CTR_embedding_254', 'predicted_CTR_embedding_255', 'predicted_CTR_embedding_256', 'predicted_CTR_embedding_257', 'predicted_CTR_embedding_258', 'predicted_CTR_embedding_259', 'predicted_CTR_embedding_26', 'predicted_CTR_embedding_260', 'predicted_CTR_embedding_261', 'predicted_CTR_embedding_262', 'predicted_CTR_embedding_263', 'predicted_CTR_embedding_264', 'predicted_CTR_embedding_265', 'predicted_CTR_embedding_266', 'predicted_CTR_embedding_267', 'predicted_CTR_embedding_268', 'predicted_CTR_embedding_269', 'predicted_CTR_embedding_27', 'predicted_CTR_embedding_270', 'predicted_CTR_embedding_271', 'predicted_CTR_embedding_272', 'predicted_CTR_embedding_273', 'predicted_CTR_embedding_274', 'predicted_CTR_embedding_275', 'predicted_CTR_embedding_276', 'predicted_CTR_embedding_277', 'predicted_CTR_embedding_278', 'predicted_CTR_embedding_279', 'predicted_CTR_embedding_28', 'predicted_CTR_embedding_280', 'predicted_CTR_embedding_281', 'predicted_CTR_embedding_282', 'predicted_CTR_embedding_283', 'predicted_CTR_embedding_284', 'predicted_CTR_embedding_285', 'predicted_CTR_embedding_286', 'predicted_CTR_embedding_287', 'predicted_CTR_embedding_288', 'predicted_CTR_embedding_289', 'predicted_CTR_embedding_29', 'predicted_CTR_embedding_290', 'predicted_CTR_embedding_291', 'predicted_CTR_embedding_292', 'predicted_CTR_embedding_293', 'predicted_CTR_embedding_294', 'predicted_CTR_embedding_295', 'predicted_CTR_embedding_296', 'predicted_CTR_embedding_297', 'predicted_CTR_embedding_298', 'predicted_CTR_embedding_299', 'predicted_CTR_embedding_3', 'predicted_CTR_embedding_30', 'predicted_CTR_embedding_300', 'predicted_CTR_embedding_301', 'predicted_CTR_embedding_302', 'predicted_CTR_embedding_303', 'predicted_CTR_embedding_304', 'predicted_CTR_embedding_305', 'predicted_CTR_embedding_306', 'predicted_CTR_embedding_307', 'predicted_CTR_embedding_308', 'predicted_CTR_embedding_309', 'predicted_CTR_embedding_31', 'predicted_CTR_embedding_310', 'predicted_CTR_embedding_311', 'predicted_CTR_embedding_312', 'predicted_CTR_embedding_313', 'predicted_CTR_embedding_314', 'predicted_CTR_embedding_315', 'predicted_CTR_embedding_316', 'predicted_CTR_embedding_317', 'predicted_CTR_embedding_318', 'predicted_CTR_embedding_319', 'predicted_CTR_embedding_32', 'predicted_CTR_embedding_320', 'predicted_CTR_embedding_321', 'predicted_CTR_embedding_322', 'predicted_CTR_embedding_323', 'predicted_CTR_embedding_324', 'predicted_CTR_embedding_325', 'predicted_CTR_embedding_326', 'predicted_CTR_embedding_327', 'predicted_CTR_embedding_328', 'predicted_CTR_embedding_329', 'predicted_CTR_embedding_33', 'predicted_CTR_embedding_330', 'predicted_CTR_embedding_331', 'predicted_CTR_embedding_332', 'predicted_CTR_embedding_333', 'predicted_CTR_embedding_334', 'predicted_CTR_embedding_335', 'predicted_CTR_embedding_336', 'predicted_CTR_embedding_337', 'predicted_CTR_embedding_338', 'predicted_CTR_embedding_339', 'predicted_CTR_embedding_34', 'predicted_CTR_embedding_340', 'predicted_CTR_embedding_341', 'predicted_CTR_embedding_342', 'predicted_CTR_embedding_343', 'predicted_CTR_embedding_344', 'predicted_CTR_embedding_345', 'predicted_CTR_embedding_346', 'predicted_CTR_embedding_347', 'predicted_CTR_embedding_348', 'predicted_CTR_embedding_349', 'predicted_CTR_embedding_35', 'predicted_CTR_embedding_350', 'predicted_CTR_embedding_351', 'predicted_CTR_embedding_352', 'predicted_CTR_embedding_353', 'predicted_CTR_embedding_354', 'predicted_CTR_embedding_355', 'predicted_CTR_embedding_356', 'predicted_CTR_embedding_357', 'predicted_CTR_embedding_358', 'predicted_CTR_embedding_359', 'predicted_CTR_embedding_36', 'predicted_CTR_embedding_360', 'predicted_CTR_embedding_361', 'predicted_CTR_embedding_362', 'predicted_CTR_embedding_363', 'predicted_CTR_embedding_364', 'predicted_CTR_embedding_365', 'predicted_CTR_embedding_366', 'predicted_CTR_embedding_367', 'predicted_CTR_embedding_368', 'predicted_CTR_embedding_369', 'predicted_CTR_embedding_37', 'predicted_CTR_embedding_370', 'predicted_CTR_embedding_371', 'predicted_CTR_embedding_372', 'predicted_CTR_embedding_373', 'predicted_CTR_embedding_374', 'predicted_CTR_embedding_375', 'predicted_CTR_embedding_376', 'predicted_CTR_embedding_377', 'predicted_CTR_embedding_378', 'predicted_CTR_embedding_379', 'predicted_CTR_embedding_38', 'predicted_CTR_embedding_380', 'predicted_CTR_embedding_381', 'predicted_CTR_embedding_382', 'predicted_CTR_embedding_383', 'predicted_CTR_embedding_384', 'predicted_CTR_embedding_39', 'predicted_CTR_embedding_4', 'predicted_CTR_embedding_40', 'predicted_CTR_embedding_41', 'predicted_CTR_embedding_42', 'predicted_CTR_embedding_43', 'predicted_CTR_embedding_44', 'predicted_CTR_embedding_45', 'predicted_CTR_embedding_46', 'predicted_CTR_embedding_47', 'predicted_CTR_embedding_48', 'predicted_CTR_embedding_49', 'predicted_CTR_embedding_5', 'predicted_CTR_embedding_50', 'predicted_CTR_embedding_51', 'predicted_CTR_embedding_52', 'predicted_CTR_embedding_53', 'predicted_CTR_embedding_54', 'predicted_CTR_embedding_55', 'predicted_CTR_embedding_56', 'predicted_CTR_embedding_57', 'predicted_CTR_embedding_58', 'predicted_CTR_embedding_59', 'predicted_CTR_embedding_6', 'predicted_CTR_embedding_60', 'predicted_CTR_embedding_61', 'predicted_CTR_embedding_62', 'predicted_CTR_embedding_63', 'predicted_CTR_embedding_64', 'predicted_CTR_embedding_65', 'predicted_CTR_embedding_66', 'predicted_CTR_embedding_67', 'predicted_CTR_embedding_68', 'predicted_CTR_embedding_69', 'predicted_CTR_embedding_7', 'predicted_CTR_embedding_70', 'predicted_CTR_embedding_71', 'predicted_CTR_embedding_72', 'predicted_CTR_embedding_73', 'predicted_CTR_embedding_74', 'predicted_CTR_embedding_75', 'predicted_CTR_embedding_76', 'predicted_CTR_embedding_77', 'predicted_CTR_embedding_78', 'predicted_CTR_embedding_79', 'predicted_CTR_embedding_8', 'predicted_CTR_embedding_80', 'predicted_CTR_embedding_81', 'predicted_CTR_embedding_82', 'predicted_CTR_embedding_83', 'predicted_CTR_embedding_84', 'predicted_CTR_embedding_85', 'predicted_CTR_embedding_86', 'predicted_CTR_embedding_87', 'predicted_CTR_embedding_88', 'predicted_CTR_embedding_89', 'predicted_CTR_embedding_9', 'predicted_CTR_embedding_90', 'predicted_CTR_embedding_91', 'predicted_CTR_embedding_92', 'predicted_CTR_embedding_93', 'predicted_CTR_embedding_94', 'predicted_CTR_embedding_95', 'predicted_CTR_embedding_96', 'predicted_CTR_embedding_97', 'predicted_CTR_embedding_98', 'predicted_CTR_embedding_99', 'predicted_CTR_emotion_anger', 'predicted_CTR_emotion_disgust', 'predicted_CTR_emotion_fear', 'predicted_CTR_emotion_joy', 'predicted_CTR_emotion_neutral', 'predicted_CTR_emotion_sadness', 'predicted_CTR_emotion_surprise', 'predicted_CTR_formality_informal', 'predicted_CTR_giberish', 'predicted_CTR_sentiment_negative', 'predicted_CTR_sentiment_neutral', 'predicted_CTR_sentiment_positive', 'predicted_CTR_syntax_complexity']'\n" - ] - } - ], - "source": [ - "causes_list = ['formality_formal',\n", - "# 'formality_informal',\n", - " 'emotion_anger',\n", - " 'emotion_disgust',\n", - " 'emotion_fear',\n", - " 'emotion_joy',\n", - " 'emotion_neutral',\n", - " 'emotion_sadness',\n", - " 'emotion_surprise',\n", - " 'syntax_complexity',\n", - " 'basic_length',\n", - " 'giberish',\n", - " 'dale_chall_score',\n", - " 'sentiment_negative',\n", - " 'sentiment_neutral',\n", - " 'sentiment_positive']\n", - "\n", - "\n", - "do_analysis(causes_list, data)" - ] - }, - { - "cell_type": "code", - "source": [ - "data[['formality_formal', 'predicted_CTR_formality_formal', 'perPerson_group_formality_formal']]" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 424 - }, - "id": "C3FfBjikf3JO", - "outputId": "19509dec-8357-4e80-84ee-41185f780a9a" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " formality_formal predicted_CTR_formality_formal \\\n", - "0 0.998468 0.014401 \n", - "1 0.019113 0.021041 \n", - "2 0.035468 0.015435 \n", - "3 0.998651 0.012756 \n", - "4 0.964811 0.013904 \n", - "... ... ... \n", - "105546 0.998615 0.010407 \n", - "105547 0.784095 0.019867 \n", - "105548 0.675907 0.028118 \n", - "105549 0.967737 0.015195 \n", - "105550 0.998637 0.015504 \n", - "\n", - " perPerson_group_formality_formal \n", - "0 True \n", - "1 False \n", - "2 False \n", - "3 True \n", - "4 True \n", - "... ... \n", - "105546 True \n", - "105547 False \n", - "105548 False \n", - "105549 True \n", - "105550 True \n", - "\n", - "[105551 rows x 3 columns]" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
formality_formalpredicted_CTR_formality_formalperPerson_group_formality_formal
00.9984680.014401True
10.0191130.021041False
20.0354680.015435False
30.9986510.012756True
40.9648110.013904True
............
1055460.9986150.010407True
1055470.7840950.019867False
1055480.6759070.028118False
1055490.9677370.015195True
1055500.9986370.015504True
\n", - "

105551 rows × 3 columns

\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "
\n", - "
\n" - ] - }, - "metadata": {}, - "execution_count": 9 - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "Effect" - ], - "metadata": { - "id": "r6fTzVvcEf6u" - } - }, - { - "cell_type": "code", - "source": [ - "embedding_list = ['embedding_%d'%number for number in range(1,385)]\n", - "np_array = data[embedding_list].head(20000).to_numpy()" - ], - "metadata": { - "id": "ebs--p0eeGsA" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "similarity = np.dot(np_array, np_array.T)\n", - "\n", - "# squared magnitude of preference vectors (number of occurrences)\n", - "square_mag = np.diag(similarity)\n", - "\n", - "# inverse squared magnitude\n", - "inv_square_mag = 1 / square_mag\n", - "\n", - "# if it doesn't occur, set it's inverse magnitude to zero (instead of inf)\n", - "inv_square_mag[np.isinf(inv_square_mag)] = 0\n", - "\n", - "# inverse of the magnitude\n", - "inv_mag = np.sqrt(inv_square_mag)\n", - "\n", - "# cosine similarity (elementwise multiply by inverse magnitudes)\n", - "cosine = similarity * inv_mag\n", - "cosine = cosine.T * inv_mag\n", - "\n", - "cosine.shape" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "pEZZ8-yBeRFo", - "outputId": "642a674a-7ad4-4541-bc0b-88013bb620e1" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(20000, 20000)" - ] - }, - "metadata": {}, - "execution_count": 15 - } - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WzZmQPxp4mS0", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 293 - }, - "outputId": "1c1cbfd8-d1c3-443a-fe9d-bb32ee4c1269" - }, - "outputs": [ - { - "output_type": "error", - "ename": "NameError", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcosineformality_formal\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m0.0009003530568299572\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0memotion_anger\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m0.0010072101737743074\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0memotion_disgust\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m0.002301255130395963\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0memotion_fear\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m0.001198326734492035\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0memotion_joy\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m0.000985384507931537\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'cosineformality_formal' is not defined" - ] - } - ], - "source": [ - "formality_formal - -0.0009003530568299572\n", - "emotion_anger - 0.0010072101737743074\n", - "emotion_disgust - 0.002301255130395963\n", - "emotion_fear - 0.001198326734492035\n", - "emotion_joy - -0.000985384507931537\n", - "emotion_neutral - -0.002278599202807139\n", - "emotion_sadness - 0.0014750827003031533\n", - "emotion_surprise - -0.0005571921426504084\n", - "syntax_complexity - 0.0017304779548953698\n", - "basic_length - -0.0017429606553956436\n", - "giberish - 0.00033531771473939014\n", - "dale_chall_score - 0.0017182004012356183\n", - "sentiment_negative - 0.0022922630018918674\n", - "sentiment_neutral - -0.0012223326278845113\n", - "sentiment_positive- -0.0021200611400042342\n", - "\n" - ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "YClKlm3oeu5W" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "-----------------------------------------------------------------------------" - ], - "metadata": { - "id": "cnuEroYHZbxJ" - } - }, - { - "cell_type": "code", - "source": [ - "!pip install nougat-ocr" - ], - "metadata": { - "id": "0mEHALxTZdtH", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "b7131460-a166-449d-b3c5-37c8bd96d1e4" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting nougat-ocr\n", - " Downloading nougat_ocr-0.1.8-py3-none-any.whl (80 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m80.2/80.2 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting transformers>=4.25.1 (from nougat-ocr)\n", - " Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.6/7.6 MB\u001b[0m \u001b[31m69.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting timm==0.5.4 (from nougat-ocr)\n", - " Downloading timm-0.5.4-py3-none-any.whl (431 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m431.5/431.5 kB\u001b[0m \u001b[31m44.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting orjson (from nougat-ocr)\n", - " Downloading orjson-3.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (138 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.7/138.7 kB\u001b[0m \u001b[31m14.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: opencv-python-headless in /usr/local/lib/python3.10/dist-packages (from nougat-ocr) (4.8.0.76)\n", - "Collecting datasets[vision] (from nougat-ocr)\n", - " Downloading datasets-2.14.5-py3-none-any.whl (519 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.6/519.6 kB\u001b[0m \u001b[31m53.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting lightning>=2.0.0 (from nougat-ocr)\n", - " Downloading lightning-2.0.9-py3-none-any.whl (1.9 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.9/1.9 MB\u001b[0m \u001b[31m96.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from nougat-ocr) (3.8.1)\n", - "Collecting python-Levenshtein (from nougat-ocr)\n", - " Downloading python_Levenshtein-0.21.1-py3-none-any.whl (9.4 kB)\n", - "Collecting sentencepiece (from nougat-ocr)\n", - " Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m93.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting sconf>=0.2.3 (from nougat-ocr)\n", - " Downloading sconf-0.2.5-py3-none-any.whl (8.8 kB)\n", - "Requirement already satisfied: albumentations in /usr/local/lib/python3.10/dist-packages (from nougat-ocr) (1.3.1)\n", - "Collecting pymupdf (from nougat-ocr)\n", - " Downloading PyMuPDF-1.23.3-cp310-none-manylinux2014_x86_64.whl (4.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.3/4.3 MB\u001b[0m \u001b[31m104.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: torch>=1.4 in /usr/local/lib/python3.10/dist-packages (from timm==0.5.4->nougat-ocr) (2.0.1+cu118)\n", - "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from timm==0.5.4->nougat-ocr) (0.15.2+cu118)\n", - "Requirement already satisfied: Jinja2<5.0 in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.0->nougat-ocr) (3.1.2)\n", - "Requirement already satisfied: PyYAML<8.0 in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.0->nougat-ocr) (6.0.1)\n", - "Collecting arrow<3.0,>=1.2.0 (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading arrow-1.2.3-py3-none-any.whl (66 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.4/66.4 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting backoff<4.0,>=2.2.1 (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\n", - "Requirement already satisfied: beautifulsoup4<6.0,>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.0->nougat-ocr) (4.11.2)\n", - "Requirement already satisfied: click<10.0 in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.0->nougat-ocr) (8.1.7)\n", - "Collecting croniter<1.5.0,>=1.3.0 (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading croniter-1.4.1-py2.py3-none-any.whl (19 kB)\n", - "Collecting dateutils<2.0 (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading dateutils-0.6.12-py2.py3-none-any.whl (5.7 kB)\n", - "Collecting deepdiff<8.0,>=5.7.0 (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading deepdiff-6.5.0-py3-none-any.whl (71 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.3/71.3 kB\u001b[0m \u001b[31m10.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting fastapi<2.0,>=0.92.0 (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading fastapi-0.103.1-py3-none-any.whl (66 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.2/66.2 kB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: fsspec<2025.0,>=2022.5.0 in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.0->nougat-ocr) (2023.6.0)\n", - "Collecting inquirer<5.0,>=2.10.0 (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading inquirer-3.1.3-py3-none-any.whl (18 kB)\n", - "Collecting lightning-cloud>=0.5.38 (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading lightning_cloud-0.5.38-py3-none-any.whl (659 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m660.0/660.0 kB\u001b[0m \u001b[31m61.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting lightning-utilities<2.0,>=0.7.0 (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading lightning_utilities-0.9.0-py3-none-any.whl (23 kB)\n", - "Requirement already satisfied: numpy<3.0,>=1.17.2 in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.0->nougat-ocr) (1.23.5)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.0->nougat-ocr) (23.1)\n", - "Requirement already satisfied: psutil<7.0 in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.0->nougat-ocr) (5.9.5)\n", - "Requirement already satisfied: pydantic<2.2.0,>=1.7.4 in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.0->nougat-ocr) (1.10.12)\n", - "Collecting python-multipart<2.0,>=0.0.5 (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: requests<4.0 in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.0->nougat-ocr) (2.31.0)\n", - "Requirement already satisfied: rich<15.0,>=12.3.0 in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.0->nougat-ocr) (13.5.2)\n", - "Collecting starlette (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading starlette-0.31.1-py3-none-any.whl (69 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m69.9/69.9 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting starsessions<2.0,>=1.2.1 (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading starsessions-1.3.0-py3-none-any.whl (10 kB)\n", - "Collecting torchmetrics<3.0,>=0.7.0 (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading torchmetrics-1.1.2-py3-none-any.whl (764 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m764.8/764.8 kB\u001b[0m \u001b[31m50.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: tqdm<6.0,>=4.57.0 in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.0->nougat-ocr) (4.66.1)\n", - "Requirement already satisfied: traitlets<7.0,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.0->nougat-ocr) (5.7.1)\n", - "Requirement already satisfied: typing-extensions<6.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.0->nougat-ocr) (4.5.0)\n", - "Requirement already satisfied: urllib3<4.0 in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.0->nougat-ocr) (2.0.4)\n", - "Collecting uvicorn<2.0 (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading uvicorn-0.23.2-py3-none-any.whl (59 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.5/59.5 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: websocket-client<3.0 in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.0->nougat-ocr) (1.6.2)\n", - "Collecting websockets<13.0 (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (129 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m129.9/129.9 kB\u001b[0m \u001b[31m10.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting pytorch-lightning (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading pytorch_lightning-2.0.9-py3-none-any.whl (727 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m727.7/727.7 kB\u001b[0m \u001b[31m49.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting ruamel.yaml (from sconf>=0.2.3->nougat-ocr)\n", - " Downloading ruamel.yaml-0.17.32-py3-none-any.whl (112 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.2/112.2 kB\u001b[0m \u001b[31m11.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting munch (from sconf>=0.2.3->nougat-ocr)\n", - " Downloading munch-4.0.0-py2.py3-none-any.whl (9.9 kB)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers>=4.25.1->nougat-ocr) (3.12.2)\n", - "Collecting huggingface-hub<1.0,>=0.15.1 (from transformers>=4.25.1->nougat-ocr)\n", - " Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m294.9/294.9 kB\u001b[0m \u001b[31m27.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.25.1->nougat-ocr) (2023.6.3)\n", - "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers>=4.25.1->nougat-ocr)\n", - " Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m108.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting safetensors>=0.3.1 (from transformers>=4.25.1->nougat-ocr)\n", - " Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m82.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: scipy>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from albumentations->nougat-ocr) (1.11.2)\n", - "Requirement already satisfied: scikit-image>=0.16.1 in /usr/local/lib/python3.10/dist-packages (from albumentations->nougat-ocr) (0.19.3)\n", - "Requirement already satisfied: qudida>=0.0.4 in /usr/local/lib/python3.10/dist-packages (from albumentations->nougat-ocr) (0.0.4)\n", - "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets[vision]->nougat-ocr) (9.0.0)\n", - "Collecting dill<0.3.8,>=0.3.0 (from datasets[vision]->nougat-ocr)\n", - " Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets[vision]->nougat-ocr) (1.5.3)\n", - "Collecting xxhash (from datasets[vision]->nougat-ocr)\n", - " Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m24.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting multiprocess (from datasets[vision]->nougat-ocr)\n", - " Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m16.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets[vision]->nougat-ocr) (3.8.5)\n", - "Requirement already satisfied: Pillow>=6.2.1 in /usr/local/lib/python3.10/dist-packages (from datasets[vision]->nougat-ocr) (9.4.0)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->nougat-ocr) (1.3.2)\n", - "Collecting PyMuPDFb==1.23.3 (from pymupdf->nougat-ocr)\n", - " Downloading PyMuPDFb-1.23.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m30.6/30.6 MB\u001b[0m \u001b[31m29.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting Levenshtein==0.21.1 (from python-Levenshtein->nougat-ocr)\n", - " Downloading Levenshtein-0.21.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (172 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m172.5/172.5 kB\u001b[0m \u001b[31m18.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting rapidfuzz<4.0.0,>=2.3.0 (from Levenshtein==0.21.1->python-Levenshtein->nougat-ocr)\n", - " Downloading rapidfuzz-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m101.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: python-dateutil>=2.7.0 in /usr/local/lib/python3.10/dist-packages (from arrow<3.0,>=1.2.0->lightning>=2.0.0->nougat-ocr) (2.8.2)\n", - "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4<6.0,>=4.8.0->lightning>=2.0.0->nougat-ocr) (2.5)\n", - "Requirement already satisfied: pytz in /usr/local/lib/python3.10/dist-packages (from dateutils<2.0->lightning>=2.0.0->nougat-ocr) (2023.3.post1)\n", - "Collecting ordered-set<4.2.0,>=4.0.2 (from deepdiff<8.0,>=5.7.0->lightning>=2.0.0->nougat-ocr)\n", - " Downloading ordered_set-4.1.0-py3-none-any.whl (7.6 kB)\n", - "Requirement already satisfied: anyio<4.0.0,>=3.7.1 in /usr/local/lib/python3.10/dist-packages (from fastapi<2.0,>=0.92.0->lightning>=2.0.0->nougat-ocr) (3.7.1)\n", - "Collecting starlette (from lightning>=2.0.0->nougat-ocr)\n", - " Downloading starlette-0.27.0-py3-none-any.whl (66 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.0/67.0 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets[vision]->nougat-ocr) (23.1.0)\n", - "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets[vision]->nougat-ocr) (3.2.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets[vision]->nougat-ocr) (6.0.4)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets[vision]->nougat-ocr) (4.0.3)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets[vision]->nougat-ocr) (1.9.2)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets[vision]->nougat-ocr) (1.4.0)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets[vision]->nougat-ocr) (1.3.1)\n", - "Collecting blessed>=1.19.0 (from inquirer<5.0,>=2.10.0->lightning>=2.0.0->nougat-ocr)\n", - " Downloading blessed-1.20.0-py2.py3-none-any.whl (58 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.4/58.4 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting python-editor>=1.0.4 (from inquirer<5.0,>=2.10.0->lightning>=2.0.0->nougat-ocr)\n", - " Downloading python_editor-1.0.4-py3-none-any.whl (4.9 kB)\n", - "Collecting readchar>=3.0.6 (from inquirer<5.0,>=2.10.0->lightning>=2.0.0->nougat-ocr)\n", - " Downloading readchar-4.0.5-py3-none-any.whl (8.5 kB)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from Jinja2<5.0->lightning>=2.0.0->nougat-ocr) (2.1.3)\n", - "Requirement already satisfied: pyjwt in /usr/lib/python3/dist-packages (from lightning-cloud>=0.5.38->lightning>=2.0.0->nougat-ocr) (2.3.0)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from lightning-cloud>=0.5.38->lightning>=2.0.0->nougat-ocr) (1.16.0)\n", - "Requirement already satisfied: scikit-learn>=0.19.1 in /usr/local/lib/python3.10/dist-packages (from qudida>=0.0.4->albumentations->nougat-ocr) (1.2.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<4.0->lightning>=2.0.0->nougat-ocr) (3.4)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<4.0->lightning>=2.0.0->nougat-ocr) (2023.7.22)\n", - "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich<15.0,>=12.3.0->lightning>=2.0.0->nougat-ocr) (3.0.0)\n", - "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich<15.0,>=12.3.0->lightning>=2.0.0->nougat-ocr) (2.16.1)\n", - "Requirement already satisfied: networkx>=2.2 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.16.1->albumentations->nougat-ocr) (3.1)\n", - "Requirement already satisfied: imageio>=2.4.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.16.1->albumentations->nougat-ocr) (2.31.3)\n", - "Requirement already satisfied: tifffile>=2019.7.26 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.16.1->albumentations->nougat-ocr) (2023.8.30)\n", - "Requirement already satisfied: PyWavelets>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.16.1->albumentations->nougat-ocr) (1.4.1)\n", - "Requirement already satisfied: itsdangerous<3.0.0,>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from starsessions<2.0,>=1.2.1->lightning>=2.0.0->nougat-ocr) (2.1.2)\n", - "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.4->timm==0.5.4->nougat-ocr) (1.12)\n", - "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.4->timm==0.5.4->nougat-ocr) (2.0.0)\n", - "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.4->timm==0.5.4->nougat-ocr) (3.27.4.1)\n", - "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.4->timm==0.5.4->nougat-ocr) (16.0.6)\n", - "Collecting h11>=0.8 (from uvicorn<2.0->lightning>=2.0.0->nougat-ocr)\n", - " Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml->sconf>=0.2.3->nougat-ocr)\n", - " Downloading ruamel.yaml.clib-0.2.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (485 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m485.6/485.6 kB\u001b[0m \u001b[31m42.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<4.0.0,>=3.7.1->fastapi<2.0,>=0.92.0->lightning>=2.0.0->nougat-ocr) (1.3.0)\n", - "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<4.0.0,>=3.7.1->fastapi<2.0,>=0.92.0->lightning>=2.0.0->nougat-ocr) (1.1.3)\n", - "Requirement already satisfied: wcwidth>=0.1.4 in /usr/local/lib/python3.10/dist-packages (from blessed>=1.19.0->inquirer<5.0,>=2.10.0->lightning>=2.0.0->nougat-ocr) (0.2.6)\n", - "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich<15.0,>=12.3.0->lightning>=2.0.0->nougat-ocr) (0.1.2)\n", - "Requirement already satisfied: setuptools>=41.0 in /usr/local/lib/python3.10/dist-packages (from readchar>=3.0.6->inquirer<5.0,>=2.10.0->lightning>=2.0.0->nougat-ocr) (67.7.2)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.19.1->qudida>=0.0.4->albumentations->nougat-ocr) (3.2.0)\n", - "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.4->timm==0.5.4->nougat-ocr) (1.3.0)\n", - "Installing collected packages: tokenizers, sentencepiece, safetensors, python-editor, xxhash, websockets, ruamel.yaml.clib, readchar, rapidfuzz, python-multipart, PyMuPDFb, orjson, ordered-set, munch, lightning-utilities, h11, dill, blessed, backoff, uvicorn, starlette, ruamel.yaml, pymupdf, multiprocess, Levenshtein, inquirer, huggingface-hub, deepdiff, dateutils, croniter, arrow, transformers, starsessions, sconf, python-Levenshtein, fastapi, lightning-cloud, datasets, torchmetrics, pytorch-lightning, timm, lightning, nougat-ocr\n", - "Successfully installed Levenshtein-0.21.1 PyMuPDFb-1.23.3 arrow-1.2.3 backoff-2.2.1 blessed-1.20.0 croniter-1.4.1 datasets-2.14.5 dateutils-0.6.12 deepdiff-6.5.0 dill-0.3.7 fastapi-0.103.1 h11-0.14.0 huggingface-hub-0.17.2 inquirer-3.1.3 lightning-2.0.9 lightning-cloud-0.5.38 lightning-utilities-0.9.0 multiprocess-0.70.15 munch-4.0.0 nougat-ocr-0.1.8 ordered-set-4.1.0 orjson-3.9.7 pymupdf-1.23.3 python-Levenshtein-0.21.1 python-editor-1.0.4 python-multipart-0.0.6 pytorch-lightning-2.0.9 rapidfuzz-3.3.0 readchar-4.0.5 ruamel.yaml-0.17.32 ruamel.yaml.clib-0.2.7 safetensors-0.3.3 sconf-0.2.5 sentencepiece-0.1.99 starlette-0.27.0 starsessions-1.3.0 timm-0.5.4 tokenizers-0.13.3 torchmetrics-1.1.2 transformers-4.33.2 uvicorn-0.23.2 websockets-11.0.3 xxhash-3.3.0\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "!pip install tensorrt" - ], - "metadata": { - "id": "kluQWbP4d3-D", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "c1d6bafc-b667-46dc-972c-af5cdc774876" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting tensorrt\n", - " Downloading tensorrt-8.6.1.post1.tar.gz (18 kB)\n", - " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "Building wheels for collected packages: tensorrt\n", - " Building wheel for tensorrt (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for tensorrt: filename=tensorrt-8.6.1.post1-py2.py3-none-any.whl size=17281 sha256=0a38ba23a5c2459f9a8e272fe2d11f68cc982ac8627b1ed1007a95d0eae92f36\n", - " Stored in directory: /root/.cache/pip/wheels/f4/c8/0e/b79b08e45752491b9acfdbd69e8a609e8b2ed7640dda5a3e59\n", - "Successfully built tensorrt\n", - "Installing collected packages: tensorrt\n", - "Successfully installed tensorrt-8.6.1.post1\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "\n", - "%%time\n", - "!nougat /content/common_sense.pdf -o output_directory" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "12MlSe7MZfBO", - "outputId": "864984c2-6bb9-43e2-8152-e375ecf22ca5" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "2023-09-19 16:42:31.152389: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "/usr/local/lib/python3.10/dist-packages/torch/functional.py:504: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:3483.)\n", - " return _VF.meshgrid(tensors, **kwargs) # type: ignore[attr-defined]\n", - " 0% 0/14 [00:00