Commit 32ec2b8
Print the number of rows for each table.
Rename CrawlHistory to crawl_history (Fixes #11).
When assigning visit_ids, follow the site_url order from the http_requests table.
1 parent a4a725b commit 32ec2b8

5 files changed: +118, -58 lines
analyze_crawl.py

Lines changed: 36 additions & 22 deletions
@@ -4,12 +4,12 @@
 import os
 from os.path import isfile, join
 from collections import defaultdict
-from tqdm import tqdm
+# from tqdm import tqdm
 import util
 from db_schema import (HTTP_REQUESTS_TABLE,
                        HTTP_RESPONSES_TABLE,
-                       JAVASCRIPT_TABLE)
-from util import dump_as_json
+                       JAVASCRIPT_TABLE, OPENWPM_TABLES)
+from util import dump_as_json, get_table_and_column_names
 
 
 class CrawlDBAnalysis(object):
@@ -44,21 +44,16 @@ def run_all_streaming_analysis(self):
 
     def get_visit_id_site_url_mapping(self):
         visit_id_site_urls = {}
-        from time import time
-        t0 = time()
         for visit_id, site_url in self.db_conn.execute(
                 "SELECT visit_id, site_url FROM site_visits"):
             visit_id_site_urls[visit_id] = site_url
-        print len(visit_id_site_urls), "Mappings. Took %s s" % (time() - t0)
+        print len(visit_id_site_urls), "mappings"
         print "Distinct site urls", len(set(visit_id_site_urls.values()))
         return visit_id_site_urls
 
     def run_streaming_analysis_for_table(self, table_name):
         current_visit_ids = {}
         processed = 0
-        num_rows = self.db_conn.execute(
-            "SELECT MAX(id) FROM %s" % table_name).fetchone()[0]
-        print "Total rows", num_rows, table_name
         cols_to_select = ["visit_id", "crawl_id"]
         if table_name == HTTP_REQUESTS_TABLE:
             cols_to_select.append("url")
@@ -73,7 +68,7 @@ def run_streaming_analysis_for_table(self, table_name):
             pass
 
         query = "SELECT %s FROM %s" % (",".join(cols_to_select), table_name)
-        for row in tqdm(self.db_conn.execute(query)):
+        for row in self.db_conn.execute(query):
             processed += 1
             visit_id = int(row["visit_id"])
             crawl_id = int(row["crawl_id"])
@@ -111,15 +106,33 @@ def run_streaming_analysis_for_table(self, table_name):
             # end of the data from the current visit
             elif visit_id > current_visit_ids[crawl_id]:
                 # self.process_visit_data(current_visit_data[crawl_id])
-                if site_url in self.sv_third_parties:
-                    del self.sv_third_parties[site_url]
+                # if site_url in self.sv_third_parties:
+                #     del self.sv_third_parties[site_url]
                 current_visit_ids[crawl_id] = visit_id
             elif visit_id < current_visit_ids[crawl_id] and visit_id > 0:
-                raise Exception(
-                    "Out of order row! Curr: %s Row: %s Crawl id: %s" %
-                    (current_visit_ids[crawl_id], visit_id, crawl_id))
+                # raise Exception(
+                #     "Out of order row! Curr: %s Row: %s Crawl id: %s" %
+                #     (current_visit_ids[crawl_id], visit_id, crawl_id))
+                print "Warning: Out of order row! Curr: %s Row: %s Crawl id: %s" % (current_visit_ids[crawl_id], visit_id, crawl_id)
+
         self.dump_crawl_data(table_name)
 
+    def print_num_of_rows(self):
+        print "Will print the number of rows"
+        db_schema_str = get_table_and_column_names(self.db_path)
+        for table_name in OPENWPM_TABLES:
+            # TODO: search in table names instead of the db schema
+            if table_name in db_schema_str:
+                try:
+                    num_rows = self.db_conn.execute(
+                        "SELECT MAX(id) FROM %s" % table_name).fetchone()[0]
+                except sqlite3.OperationalError:
+                    num_rows = self.db_conn.execute(
+                        "SELECT COUNT(*) FROM %s" % table_name).fetchone()[0]
+                if num_rows is None:
+                    num_rows = 0
+                print "Total rows", table_name, num_rows
+
     def dump_crawl_data(self, table_name):
         if table_name == HTTP_REQUESTS_TABLE:
             self.dump_json(self.sv_num_requests, "sv_num_requests.json")
@@ -139,36 +152,37 @@ def dump_json(self, obj, out_file):
                                  out_file)))
 
     def start_analysis(self):
+        self.print_num_of_rows()
         self.check_crawl_history()
         self.run_all_streaming_analysis()
 
     def check_crawl_history(self):
-        """Compute failure and timeout rates for CrawlHistory table."""
+        """Compute failure and timeout rates for crawl_history table."""
         command_counts = {}  # num. of total commands by type
         fails = {}  # num. of failed commands grouped by cmd type
         timeouts = {}  # num. of timeouts
         for row in self.db_conn.execute(
                 """SELECT command, count(*)
-                FROM CrawlHistory
+                FROM crawl_history
                 GROUP BY command;""").fetchall():
             command_counts[row["command"]] = row["count(*)"]
-            print "CrawlHistory Totals", row["command"], row["count(*)"]
+            print "crawl_history Totals", row["command"], row["count(*)"]
 
         for row in self.db_conn.execute(
                 """SELECT command, count(*)
-                FROM CrawlHistory
+                FROM crawl_history
                 WHERE bool_success = 0
                 GROUP BY command;""").fetchall():
             fails[row["command"]] = row["count(*)"]
-            print "CrawlHistory Fails", row["command"], row["count(*)"]
+            print "crawl_history Fails", row["command"], row["count(*)"]
 
         for row in self.db_conn.execute(
                 """SELECT command, count(*)
-                FROM CrawlHistory
+                FROM crawl_history
                 WHERE bool_success = -1
                 GROUP BY command;""").fetchall():
             timeouts[row["command"]] = row["count(*)"]
-            print "CrawlHistory Timeouts", row["command"], row["count(*)"]
+            print "crawl_history Timeouts", row["command"], row["count(*)"]
 
         for command in command_counts.keys():
            self.command_fail_rate[command] = (fails.get(command, 0) /
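
The new print_num_of_rows step prefers SELECT MAX(id), which SQLite can answer without scanning the table when id is the INTEGER PRIMARY KEY, and only falls back to a full COUNT(*) when the fast query fails. A minimal standalone sketch of that fallback, assuming a crawl database file; the path and table name below are illustrative, and MAX(id) is only an approximation of the row count when ids have gaps:

    import sqlite3

    def count_rows(db_path, table_name):
        # Approximate row count; exact only if ids are gapless and start at 1.
        conn = sqlite3.connect(db_path)
        try:
            # Fast path: MAX(id) avoids a full table scan on a rowid-backed id.
            num_rows = conn.execute(
                "SELECT MAX(id) FROM %s" % table_name).fetchone()[0]
        except sqlite3.OperationalError:
            # No usable id column: fall back to a full COUNT(*) scan.
            num_rows = conn.execute(
                "SELECT COUNT(*) FROM %s" % table_name).fetchone()[0]
        conn.close()
        return num_rows or 0

    print(count_rows("crawl.sqlite", "http_requests"))  # hypothetical path/table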

batch-process.sh

Lines changed: 26 additions & 11 deletions
@@ -1,26 +1,41 @@
 #!/bin/bash
-set -e
+#set -e
 
 CENSUS_LZ4_DATA_PATH="/mnt/10tb4/census_data_lz4"
 
+CENSUS_NORMALIZED_LZ4_DATA_PATH="/mnt/10tb4/census_data_lz4/normalized/"
+
 # We'll extract, process and delete each compressed crawl data
-EXTRACTION_DIR="/mnt/ssd/census_tmp"
+# EXTRACTION_DIR="/mnt/ssd/census_tmp"
+EXTRACTION_DIR="/tmp/census_tmp"
 
 function decompress_and_process(){
     ARCHIVE_BASE_NAME=$(basename "$1")
     CRAWL_NAME=${ARCHIVE_BASE_NAME/.tar.lz4/}
     CRAWL_DATA_PATH=$EXTRACTION_DIR/$CRAWL_NAME
     echo "Will extract $1 to $CRAWL_DATA_PATH"
-    time lz4 -dc --no-sparse $1 | tar xf - -C $EXTRACTION_DIR
-    time python process_crawl_data.py $CRAWL_DATA_PATH
-    # ls -l $EXTRACTION_DIR/*201*/201*.sqlite
-    # echo "Will vacuum the database"
-    # time sqlite3 $EXTRACTION_DIR/*201*/*201*.sqlite 'VACUUM;'
-    # ls -l $EXTRACTION_DIR/*201*/*201*.sqlite
-    echo "Will remove $EXTRACTION_DIR/201*"
+    time lz4 -qdc --no-sparse $1 | tar xf - -C $EXTRACTION_DIR
+    python process_crawl_data.py $CRAWL_DATA_PATH
+    echo "Size before vacuuming"
+    ls -hl $EXTRACTION_DIR/*201*/201*.sqlite
+    time sqlite3 $EXTRACTION_DIR/*201*/*201*.sqlite 'VACUUM;'
+    echo "Size after vacuuming"
+    ls -hl $EXTRACTION_DIR/*201*/*201*.sqlite
+    mkdir -p $CENSUS_NORMALIZED_LZ4_DATA_PATH/$2
+
+    OUT_NORMALIZED_ARCHIVE=$EXTRACTION_DIR/$ARCHIVE_BASE_NAME
+    pushd .
+    cd $EXTRACTION_DIR
+    tar c *201* | lz4 -zq - $OUT_NORMALIZED_ARCHIVE
+    popd
+    scp $OUT_NORMALIZED_ARCHIVE odin://mnt/10tb2/census-release-normalized/$2/
+    rm $OUT_NORMALIZED_ARCHIVE
+    echo "Will remove $EXTRACTION_DIR/*201*"
     rm -rf $EXTRACTION_DIR/*201*
+    # !!! retain the original archive
+    # rm $1
 }
 
-for crawl_archive_lz4 in $CENSUS_LZ4_DATA_PATH/*.tar.lz4
-  do decompress_and_process $crawl_archive_lz4
+for crawl_archive_lz4 in $CENSUS_LZ4_DATA_PATH/$1/*.tar.lz4
+  do decompress_and_process $crawl_archive_lz4 $1
 done;
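
batch-process.sh now takes a crawl-group argument ($1), vacuums the extracted database, repacks the normalized result under census_data_lz4/normalized, copies it to the odin host, and retains the original archive. If the extract-and-process half of that per-archive flow were driven from Python instead of bash, a rough subprocess sketch could look like the following; the paths come from the script, while the function name is purely illustrative:

    import os
    import subprocess

    EXTRACTION_DIR = "/tmp/census_tmp"  # same default the script now uses

    def extract_and_process(archive_path):
        crawl_name = os.path.basename(archive_path).replace(".tar.lz4", "")
        crawl_data_path = os.path.join(EXTRACTION_DIR, crawl_name)
        # Equivalent of: lz4 -qdc --no-sparse $1 | tar xf - -C $EXTRACTION_DIR
        lz4 = subprocess.Popen(["lz4", "-qdc", "--no-sparse", archive_path],
                               stdout=subprocess.PIPE)
        subprocess.check_call(["tar", "xf", "-", "-C", EXTRACTION_DIR],
                              stdin=lz4.stdout)
        lz4.stdout.close()
        lz4.wait()
        # Equivalent of: python process_crawl_data.py $CRAWL_DATA_PATH
        subprocess.check_call(["python", "process_crawl_data.py",
                               crawl_data_path])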

db_schema.py

Lines changed: 18 additions & 2 deletions
@@ -1,5 +1,5 @@
 # TODO:task and crawl have different, non-overlapping columns across versions.
-# xpath, site_visits, CrawlHistory, http_redirects has one version only
+# xpath, site_visits, crawl_history, http_redirects has one version only
 # flash cookies, profile_cookies has page_url/visit_id difference
 # content_policy, pages: no table
 
@@ -134,7 +134,7 @@
 JAVASCRIPT_TABLE = "javascript"
 JAVASCRIPT_COOKIES_TABLE = "javascript_cookies"
 SITE_VISITS_TABLE = "site_visits"
-CRAWL_HISTORY_TABLE = "CrawlHistory"
+CRAWL_HISTORY_TABLE = "crawl_history"
 CRAWL_TABLE = "crawl"
 TASK_TABLE = "task"
 HTTP_REQUESTS_PROXY_TABLE = "http_requests_proxy"
@@ -150,3 +150,19 @@
     FLASH_COOKIES_TABLE: DB_SCHEMA_FLASH_COOKIES,
     PROFILE_COOKIES_TABLE: DB_SCHEMA_PROFILE_COOKIES,
 }
+
+OPENWPM_TABLES = [
+    HTTP_REQUESTS_TABLE,
+    HTTP_RESPONSES_TABLE,
+    JAVASCRIPT_TABLE,
+    JAVASCRIPT_COOKIES_TABLE,
+    SITE_VISITS_TABLE,
+    CRAWL_HISTORY_TABLE,
+    CRAWL_TABLE,
+    TASK_TABLE,
+    HTTP_REQUESTS_PROXY_TABLE,
+    HTTP_RESPONSES_PROXY_TABLE,
+    PROFILE_COOKIES_TABLE,
+    FLASH_COOKIES_TABLE,
+    LOCALSTORAGE_TABLE
+]
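
OPENWPM_TABLES is what the new print_num_of_rows loop in analyze_crawl.py iterates over, checking each name against the dumped schema string before querying it (hence the TODO about matching table names rather than the whole schema). A rough sketch of that membership check, using a direct sqlite_master query as a stand-in for util.get_table_and_column_names and a hand-written subset of the table list so the snippet runs on its own:

    import sqlite3

    # Subset of db_schema.OPENWPM_TABLES, spelled out so the sketch is standalone.
    OPENWPM_TABLES = ["http_requests", "http_responses", "javascript",
                      "site_visits", "crawl_history", "crawl", "task"]

    def existing_openwpm_tables(db_path):
        conn = sqlite3.connect(db_path)
        # Stand-in for util.get_table_and_column_names: join CREATE statements.
        db_schema_str = "\n".join(
            row[0] for row in conn.execute(
                "SELECT sql FROM sqlite_master WHERE type = 'table'") if row[0])
        conn.close()
        # Substring match, as in the current code; a column named like a table
        # would also match, which is what the TODO wants to tighten.
        return [name for name in OPENWPM_TABLES if name in db_schema_str]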

normalize_db.py

Lines changed: 22 additions & 16 deletions
@@ -11,6 +11,13 @@
                         JAVASCRIPT_TABLE, JAVASCRIPT_COOKIES_TABLE]
 
 
+def rename_crawl_history_table(con):
+    try:
+        con.execute("ALTER TABLE CrawlHistory RENAME TO crawl_history;")
+    except sqlite3.OperationalError:
+        pass
+
+
 def add_visit_id_col_to_tables(con):
     for table_name in TABLES_WITH_TOP_URL:
         try:
@@ -38,8 +45,9 @@ def add_site_visits_table(con):
     # See http://alweeam.com.sa in 2016-01_spider_4 for an example
     # The following query causes
     # query = "select DISTINCT top_url, MAX(crawl_id) from http_requests"
-    query = "SELECT top_url, MAX(crawl_id) FROM http_requests GROUP BY top_url"
-    for visit_id, (top_url, crawl_id) in enumerate(cur.execute(query)):
+    query = """SELECT top_url, MAX(crawl_id), MIN(id) as min_id FROM
+               http_requests GROUP BY top_url ORDER by min_id ASC"""
+    for visit_id, (top_url, crawl_id, _) in enumerate(cur.execute(query), 1):
         if not top_url:
             print "Warning: Empty top-url", top_url, crawl_id
         site_visits.append((visit_id, crawl_id, top_url))
@@ -83,8 +91,8 @@ def add_missing_columns(con, table_name, db_schema_str, site_url_visit_id_map):
     new_columns = get_column_names_from_create_query(
         TABLE_SCHEMAS[table_name])
     if new_columns == existing_columns:
-        print "No missing columns to add to", table_name
-        return
+        # print "No missing columns to add to", table_name
+        return False
     print "Will add missing columns to %s: %s" % (table_name, set(
         new_columns).difference(set(existing_columns)))
 
@@ -115,7 +123,7 @@ def add_missing_columns(con, table_name, db_schema_str, site_url_visit_id_map):
         cols_to_insert = common_columns + ["visit_id", ]
         stream_qry = "SELECT %s FROM %s " % (",".join(cols_to_select),
                                              tmp_table_name)
-        print "Will iterate over", stream_qry
+        # print "Will iterate over", stream_qry
         insert_qry = "INSERT INTO %s (%s) VALUES (%s)" % (
             table_name, ",".join(cols_to_insert),
             ",".join("?" * len(cols_to_insert)))
@@ -133,7 +141,7 @@ def add_missing_columns(con, table_name, db_schema_str, site_url_visit_id_map):
             # print "Will execute %s" % qry
             # con.execute(qry, row)
             processed += 1
-            if processed % 10000 == 0:
+            if processed % 100000 == 0:
                 con.executemany(insert_qry, data_to_insert)
                 del data_to_insert[:]
                 print_progress(t0, processed, num_rows)
@@ -142,7 +150,7 @@ def add_missing_columns(con, table_name, db_schema_str, site_url_visit_id_map):
         # read from the temp table and write into the new table
         stream_qry = "SELECT %s FROM %s " % (",".join(common_columns),
                                              tmp_table_name)
-        print "Will iterate over", stream_qry
+        # print "Will iterate over", stream_qry
         insert_qry = "INSERT INTO %s (%s) VALUES (%s)" % (
             table_name, ",".join(common_columns),
             ",".join("?" * len(common_columns)))
@@ -151,7 +159,7 @@ def add_missing_columns(con, table_name, db_schema_str, site_url_visit_id_map):
             # print "Will execute %s" % qry
             # con.execute(insert_qry, row)
             processed += 1
-            if processed % 10000 == 0:
+            if processed % 100000 == 0:
                 con.executemany(insert_qry, data_to_insert)
                 del data_to_insert[:]
                 print_progress(t0, processed, num_rows)
@@ -160,10 +168,8 @@ def add_missing_columns(con, table_name, db_schema_str, site_url_visit_id_map):
     print "Will drop the temp table",
     con.execute("DROP TABLE %s" % tmp_table_name)
     print "(took", time() - t0, "s)"
-    print "Will commit changes",
-    t0 = time()
     con.commit()
-    print "(took", time() - t0, "s)"
+    return True
 
 
 def get_column_names_from_create_query(create_table_query):
@@ -192,11 +198,11 @@ def add_missing_columns_to_all_tables(con, db_schema_str):
         # TODO: search in table names instead of the db schema
         if table_name in db_schema_str:
             t0 = time()
-            add_missing_columns(con, table_name, db_schema_str,
-                                site_url_visit_id_map)
-            duration = time() - t0
-            print "Took %s s to add missing columns to %s" % (duration,
-                                                              table_name)
+            if add_missing_columns(con, table_name, db_schema_str,
+                                   site_url_visit_id_map):
+                duration = time() - t0
+                print "Took %s s to add missing columns to %s" % (duration,
+                                                                  table_name)
 
 
 if __name__ == '__main__':
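
The add_site_visits_table change above is the visit_id part of the commit message: http_requests is grouped by top_url, groups are ordered by the smallest request id so visit_ids follow the order in which sites first appear, and enumerate now starts at 1 instead of 0. A self-contained illustration against an in-memory table, with the schema reduced to the three columns the query touches and made-up example URLs:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE http_requests "
                 "(id INTEGER PRIMARY KEY, top_url TEXT, crawl_id INTEGER)")
    conn.executemany(
        "INSERT INTO http_requests (top_url, crawl_id) VALUES (?, ?)",
        [("http://b.example", 1), ("http://a.example", 1),
         ("http://b.example", 1), ("http://a.example", 2)])

    query = """SELECT top_url, MAX(crawl_id), MIN(id) as min_id FROM
               http_requests GROUP BY top_url ORDER by min_id ASC"""
    site_visits = []
    for visit_id, (top_url, crawl_id, _) in enumerate(conn.execute(query), 1):
        site_visits.append((visit_id, crawl_id, top_url))

    # b.example was requested first, so it gets visit_id 1:
    # [(1, 1, 'http://b.example'), (2, 2, 'http://a.example')]
    print(site_visits)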

process_crawl_data.py

Lines changed: 16 additions & 7 deletions
@@ -1,13 +1,14 @@
 import sys
 import sqlite3
 import os
+from time import time
 from util import CRAWL_DB_EXT, get_table_and_column_names, load_alexa_ranks,\
     copy_if_not_exists
 from os.path import join, isfile, basename, isdir, dirname, sep
 import glob
 from normalize_db import add_site_visits_table, add_alexa_rank_to_site_visits,\
-    add_missing_columns_to_all_tables
-from db_schema import SITE_VISITS_TABLE
+    add_missing_columns_to_all_tables, rename_crawl_history_table
+from db_schema import SITE_VISITS_TABLE, CRAWL_HISTORY_TABLE
 from analyze_crawl import CrawlDBAnalysis
 
 ROOT_OUT_DIR = "/mnt/10tb4/census-release"
@@ -24,11 +25,9 @@
 CRONTAB_LOG_FILENAME = "crontab.log"
 ALEXA_TOP1M_CSV_FILENAME = "top-1m.csv"
 JAVASCRIPT_SRC_DIRNAME = "content.ldb"
-DEFAULT_SQLITE_CACHE_SIZE_GB = 3
+DEFAULT_SQLITE_CACHE_SIZE_GB = 16
 
-# Disable adding new columns for now
-# TODO: enable for the final runs
-ADD_MISSING_COLUMNS = False
+ADD_MISSING_COLUMNS = True
 
 
 class CrawlData(object):
@@ -65,8 +64,10 @@ def optimize_db(self, size_in_gb=DEFAULT_SQLITE_CACHE_SIZE_GB):
 
     def vacuum_db(self):
         """."""
-        print "Will vacuum the DB"
+        print "Will vacuum the DB",
+        t0 = time()
         self.db_conn.execute("VACUUM;")
+        print "finished in", float(time() - t0) / 60, "mins"
 
     def set_crawl_dir(self, crawl_dir):
         if isdir(crawl_dir):
@@ -117,6 +118,9 @@ def normalize_db(self):
         if SITE_VISITS_TABLE not in db_schema_str:
             print "Adding site_visits table"
             add_site_visits_table(self.db_conn)
+        if CRAWL_HISTORY_TABLE not in db_schema_str:
+            print "Renaming CrawlHistory table to crawl_history"
+            rename_crawl_history_table(self.db_conn)
         # Add site ranks to site_visits table
         if "site_rank" not in db_schema_str:
             if self.alexa_csv_path:
@@ -127,6 +131,7 @@ def normalize_db(self):
                 print "Missing Alexa ranks CSV, can't add ranks to site_visits"
         if ADD_MISSING_COLUMNS:
             add_missing_columns_to_all_tables(self.db_conn, db_schema_str)
+        print "Will commit the changes"
         self.db_conn.commit()
 
     def dump_db_schema(self):
@@ -158,8 +163,12 @@ def backup_crawl_files(self):
 
 
 if __name__ == '__main__':
+    t0 = time()
     crawl_data = CrawlData(sys.argv[1])
     crawl_data.pre_process()
+    t1 = time()
+    print "Preprocess finished in", float(t1 - t0) / 60, "mins"
     analysis = CrawlDBAnalysis(crawl_data.crawl_db_path, ANALYSIS_OUT_DIR,
                                crawl_data.crawl_name)
     analysis.start_analysis()
+    print "Analysis finished in", float(time() - t1) / 60, "mins"
