Skip to content

Commit deea907

Browse files
authored
fix: github data (#13)
1 parent 013b2ee commit deea907

File tree

11 files changed

+1334
-616
lines changed

11 files changed

+1334
-616
lines changed

.github/workflows/get-data.yml

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
name: Fetch Contribution Data 🛠️
22

33
on:
4-
# push:
5-
# branches: [ main ]
6-
# pull_request:
7-
# branches: [ main ]
4+
push:
5+
branches: [main]
86
workflow_dispatch:
97
schedule:
108
- cron: "0 0 * * 1" # Runs at 00:00 UTC on Monday
@@ -28,12 +26,12 @@ jobs:
2826

2927
steps:
3028
- name: Checkout repository
31-
uses: actions/checkout@v3
29+
uses: actions/checkout@v4
3230

3331
- name: Set up Node.js
34-
uses: actions/setup-node@v3
32+
uses: actions/setup-node@v4
3533
with:
36-
node-version: "21"
34+
node-version: "22"
3735

3836
- name: Install dependencies
3937
run: yarn install
@@ -47,7 +45,7 @@ jobs:
4745
done
4846
4947
- name: Configure AWS credentials
50-
uses: aws-actions/configure-aws-credentials@v1
48+
uses: aws-actions/configure-aws-credentials@v4
5149
with:
5250
aws-access-key-id: ${{ secrets.AWS_FE_DEPLOYER_ACCESS }}
5351
aws-secret-access-key: ${{ secrets.AWS_FE_DEPLOYER_SECRET }}
@@ -63,17 +61,28 @@ jobs:
6361
6462
- name: Run it 🚀
6563
run: |
64+
echo "Building project..."
6665
yarn build
66+
67+
echo "Setting up database schemas..."
6768
cd ./packages/stats-db
68-
./scripts/schema.sh
69+
./scripts/schema.sh -s npm
70+
./scripts/schema.sh -s github
6971
72+
echo "Fetching NPM data..."
7073
yarn npm:fetch:packages
7174
yarn npm:fetch:downloads
7275
yarn npm:report
7376
yarn npm:badges
7477
yarn npm:readme
78+
79+
echo "Fetching GitHub data..."
80+
yarn gh:fetch
81+
82+
echo "Uploading database dump to S3..."
83+
yarn db:dump:s3
7584
env:
76-
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
85+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
7786
DATABASE_URL: postgres://postgres:password@localhost:5432/example_db
7887

7988
# AWS S3 Configuration
@@ -89,8 +98,8 @@ jobs:
8998
with:
9099
name: lib-count-badges
91100
path: |
92-
badges/lib-count/total_downloads.json
93-
badges/lib-count/monthly_downloads.json
94-
badges/lib-count/utils_category.json
95-
badges/lib-count/launchql_category.json
96-
badges/lib-count/hyperweb_category.json
101+
output/badges/lib-count/total_downloads.json
102+
output/badges/lib-count/monthly_downloads.json
103+
output/badges/lib-count/utils_category.json
104+
output/badges/lib-count/launchql_category.json
105+
output/badges/lib-count/hyperweb_category.json

packages/stats-db/README.md

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,53 @@ To index from scratch, follow these steps in order:
106106
```sh
107107
yarn npm:report && yarn npm:badges && yarn npm:readme
108108
```
109+
110+
# GitHub Analytics
111+
112+
## **Project Overview**
113+
114+
A TypeScript-based tool for collecting GitHub ecosystem data to map contributor networks and organizational relationships within the Cosmos blockchain ecosystem.
115+
116+
## **Data Collection Requirements**
117+
118+
### **1. Repository Collection**
119+
120+
- **Target Organizations**: `hyperweb-io` and `launchql`
121+
- **Repository Filter**: Collect only non-fork repositories from each organization
122+
- **Repository Data**:
123+
- Repository ID, name, and full name
124+
- HTML URL and privacy status
125+
- Fork status (to enable filtering)
126+
127+
### **2. Contributor Collection**
128+
129+
- **Scope**: All contributors to all non-fork repositories collected in step 1
130+
- **Contributor Data**:
131+
- GitHub username (login)
132+
- User ID
133+
- Contribution count per repository
134+
- Total contributions across all repositories
135+
136+
### **3. Organization Network Discovery**
137+
138+
- **Scope**: All public organizations that any contributor (from step 2) belongs to
139+
- **Organization Data**:
140+
- Organization login/name
141+
- Organization API URL
142+
- Unique organization list (deduplicated across all contributors)
143+
144+
### **Data Collection Flow**
145+
146+
1. Fetch all repositories from `hyperweb-io` and `launchql` organizations
147+
2. Filter out forked repositories, keeping only original repositories
148+
3. For each non-fork repository, fetch complete contributor list
149+
4. For each unique contributor discovered, fetch their public organization memberships
150+
5. Aggregate and deduplicate all discovered organizations
151+
152+
### **Output Requirements**
153+
154+
- **Non-fork repositories**: Grouped by owning organization (`hyperweb-io` or `launchql`)
155+
- **Contributor profiles**: Including cross-repository contribution mapping
156+
- **Organization network**: Complete deduplicated list of all public organizations discovered through contributor analysis
157+
158+
This data collection strategy enables comprehensive ecosystem analysis by mapping the full network of organizations that are connected to the target GitHub organizations through shared contributors.

packages/stats-db/package.json

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@
3030
"npm:report": "DATABASE_URL=postgres://postgres:password@localhost:5432/example_db ts-node ./src/tasks/npm/npm.tasks.ts generate:report",
3131
"npm:badges": "DATABASE_URL=postgres://postgres:password@localhost:5432/example_db ts-node ./src/tasks/npm/npm.tasks.ts generate:badges",
3232
"npm:readme": "DATABASE_URL=postgres://postgres:password@localhost:5432/example_db ts-node ./src/tasks/npm/npm.tasks.ts generate:readme",
33-
"gh:fetch": "ts-node src/tasks/github/github.tasks.ts fetch",
34-
"gh:report": "ts-node src/tasks/github/github.tasks.ts report",
35-
"gh:export": "ts-node src/tasks/github/github.tasks.ts export",
33+
"gh:fetch": "DATABASE_URL=postgres://postgres:password@localhost:5432/example_db ts-node src/tasks/github/github.tasks.ts fetch",
34+
"gh:report": "DATABASE_URL=postgres://postgres:password@localhost:5432/example_db ts-node src/tasks/github/github.tasks.ts report",
35+
"gh:export": "DATABASE_URL=postgres://postgres:password@localhost:5432/example_db ts-node src/tasks/github/github.tasks.ts export",
3636
"gh:analyze": "ts-node src/tasks/github/analyze-repo.ts",
3737
"db:dump": "ts-node ./src/dump.ts --no-upload",
3838
"db:dump:s3": "ts-node ./src/dump.ts",
@@ -48,6 +48,8 @@
4848
"@interweb/fetch-api-client": "^0.6.0",
4949
"@interweb/http-errors": "^0.1.0",
5050
"@octokit/graphql": "^8.2.1",
51+
"@octokit/plugin-retry": "^8.0.1",
52+
"@octokit/plugin-throttling": "^11.0.1",
5153
"@octokit/rest": "^21.1.1",
5254
"@types/pg": "^8.11.10",
5355
"dotenv": "^16.4.7",

packages/stats-db/scripts/github.sql

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,24 @@ CREATE TABLE github.author (
2828
login text NOT NULL,
2929
name text,
3030
avatar_url text,
31+
primary_email text, -- Most frequently used email from commits
3132
created_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP,
3233
updated_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP
3334
);
3435

36+
-- Create author emails table to track all emails used by contributors
37+
CREATE TABLE github.author_email (
38+
id uuid PRIMARY KEY DEFAULT uuid_generate_v4(),
39+
author_id uuid NOT NULL REFERENCES github.author(id),
40+
email text NOT NULL,
41+
commit_count integer NOT NULL DEFAULT 1, -- How many commits used this email
42+
first_seen_at timestamp with time zone NOT NULL,
43+
last_seen_at timestamp with time zone NOT NULL,
44+
created_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP,
45+
updated_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP,
46+
UNIQUE (author_id, email)
47+
);
48+
3549
-- Create the repositories table
3650
CREATE TABLE github.repository (
3751
id uuid PRIMARY KEY DEFAULT uuid_generate_v4(),
@@ -42,6 +56,10 @@ CREATE TABLE github.repository (
4256
url text NOT NULL,
4357
is_fork boolean NOT NULL DEFAULT false,
4458
fork_date timestamp with time zone,
59+
parent_repo text, -- Full name of parent repository (e.g., "owner/repo")
60+
source_repo text, -- Full name of ultimate source repository if different from parent
61+
fork_detection_method text, -- 'github_api', 'known_forks', 'commit_analysis', 'name_similarity', 'manual_verification'
62+
fork_detection_confidence text, -- 'high', 'medium', 'low'
4563
owner_id uuid NOT NULL REFERENCES github.organization(id),
4664
stars_count integer NOT NULL DEFAULT 0,
4765
forks_count integer NOT NULL DEFAULT 0,
@@ -104,9 +122,15 @@ CREATE TABLE github.contribution_summary (
104122
-- Create indexes for better query performance
105123
CREATE INDEX idx_repository_owner ON github.repository(owner_id);
106124
CREATE INDEX idx_repository_fork_date ON github.repository(fork_date) WHERE fork_date IS NOT NULL;
125+
CREATE INDEX idx_repository_parent_repo ON github.repository(parent_repo) WHERE parent_repo IS NOT NULL;
126+
CREATE INDEX idx_repository_source_repo ON github.repository(source_repo) WHERE source_repo IS NOT NULL;
127+
CREATE INDEX idx_repository_fork_detection ON github.repository(fork_detection_method, fork_detection_confidence) WHERE is_fork = true;
107128
CREATE INDEX idx_daily_contribution_repo_date ON github.daily_contribution(repository_id, date);
108129
CREATE INDEX idx_daily_contribution_author_date ON github.daily_contribution(author_id, date);
109130
CREATE INDEX idx_author_org_history_dates ON github.author_organization_history(author_id, organization_id, joined_at);
131+
CREATE INDEX idx_author_email_author ON github.author_email(author_id);
132+
CREATE INDEX idx_author_email_email ON github.author_email(email);
133+
CREATE INDEX idx_author_email_commit_count ON github.author_email(author_id, commit_count DESC);
110134

111135
-- Add indexes for org connection queries
112136
CREATE INDEX idx_org_connection_source ON github.organization_connection(source_org_id);

packages/stats-db/src/tasks/github/Untitled-1

Lines changed: 0 additions & 13 deletions
This file was deleted.

0 commit comments

Comments
 (0)