|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +""" |
| 4 | +TODO: remove invalid emails, at the very least those that contain newlines or <>, |
| 5 | +which make fsck fail. Those must have been pushed on a previous version of GitHub. |
| 6 | +
|
| 7 | +A repository with one commit per commit email: |
| 8 | +https://github.com/cirosantilli/all-github-commit-emails |
| 9 | +
|
| 10 | +As of 2015-12-31, the generated repo is just under 1 GB, |
| 11 | +so under GitHub's soft max repo size limit. |
| 12 | +
|
| 13 | + remote='git@github.com:cirosantilli/imagine-all-the-people.git' |
| 14 | + for i in `seq 10 10 100`; do |
| 15 | + git --git-dir=repo.tmp/.git push -f "$remote" "$i:master" |
| 16 | + done |
| 17 | + # TODO for some reason I needed this afterwards. |
| 18 | + git --git-dir=tmp/repo.tmp/.git push "$remote" 'master' |
| 19 | +""" |
| 20 | + |
| 21 | +import datetime |
| 22 | +import subprocess |
| 23 | +import time |
| 24 | +import os |
| 25 | + |
| 26 | +import util |
| 27 | + |
# Directory holding the email list files, one email per line.
# data_dir_path = '/path/to/all-github-commit-emails/emails'
data_dir_path = '/home/ciro/bak/git/all-github-commit-emails/emails'

name = b'a'

util.init()

tree = util.create_tree_with_one_file()
commit = None
n = 1000000
# Number of commits between two progress reports / checkpoints.
percent = n // 100
p = 0
i = 0


def _iter_emails(dir_path):
    """Yield one email per line of every file in dir_path.

    Files are visited in sorted name order and read as bytes. Each line has
    its trailing whitespace stripped and is truncated to 255 bytes.
    """
    for file_name in sorted(os.listdir(dir_path)):
        with open(os.path.join(dir_path, file_name), 'rb') as email_file:
            for raw_line in email_file:
                yield raw_line.rstrip()[:255]


# Build one linear history: each email becomes a commit whose parent is the
# commit created for the previous email.
for email in _iter_emails(data_dir_path):
    commit, _, _ = util.save_commit_object(
        tree,
        (commit,),
        author_email=email,
        committer_email=email,
    )

    if i % percent == 0:
        # Progress report.
        print(p)
        print(email)
        print(datetime.datetime.now())
        p += 1

        # Loose objects are too large and blow up the tmpfs, so pack them at
        # every checkpoint.
        #
        # `git gc` also produces clean packs, but its calculation takes more
        # and more memory, slows down and blows up at the end.
        # TODO which subcommand blows up exactly?
        #
        # The last command tags the checkpoint commit with the already
        # incremented counter, so the first tag is named '1'.
        for git_args in (
            ['repack'],
            ['prune-packed'],
            ['tag', str(p), commit],
        ):
            subprocess.check_output(['git'] + git_args)

    i += 1

    # NOTE(review): master is created exactly once, when the 100000th commit
    # has been written -- confirm this is the intended point rather than the
    # final commit.
    if i == 100000:
        util.create_master(commit)
0 commit comments