Downtime / how much they sleep #35

Closed · wants to merge 22 commits
18 changes: 17 additions & 1 deletion README.md
@@ -12,11 +12,27 @@ pnpm dev

go to `/admin` and log in with the password. add a Twitter account by copying the `auth_token` and `ct0` cookies.

scrapes run only once in dev; in prod they run every ~1 minute.
### scraper

you can run it manually:

```
pnpm scraper:run scrap retweets
pnpm scraper:run scrap likes
# check --help for debugging flags, etc.
```

or run the cron, which is what prod uses:

```
pnpm scraper:run cron
```

## production

```
git pull && pnpm install && pnpm build && cp -r drizzle build/
node -r dotenv/config build
# in another tty
pnpm scraper:run cron
```
14 changes: 14 additions & 0 deletions devbox.json
@@ -0,0 +1,14 @@
{
"packages": [
"nodejs@latest",
"nodePackages.pnpm@latest",
"git-lfs@latest",
"sqlite@latest"
],
"shell": {
"init_hook": ["echo 'Welcome to devbox!' > /dev/null"],
"scripts": {
"dev": ["pnpm dev"]
}
}
}
85 changes: 85 additions & 0 deletions devbox.lock
@@ -0,0 +1,85 @@
{
"lockfile_version": "1",
"packages": {
"git-lfs@latest": {
"last_modified": "2024-02-10T18:15:24Z",
"resolved": "github:NixOS/nixpkgs/10b813040df67c4039086db0f6eaf65c536886c6#git-lfs",
"source": "devbox-search",
"version": "3.4.1",
"systems": {
"aarch64-darwin": {
"store_path": "/nix/store/a6paw80iqyaa3s56afd7lx41kh4j7rm2-git-lfs-3.4.1"
},
"aarch64-linux": {
"store_path": "/nix/store/6prz36drh0sa8ir9ns5qx2ca582009a1-git-lfs-3.4.1"
},
"x86_64-darwin": {
"store_path": "/nix/store/ds3g9yds887fabw77mxi6wfgl9rizmn3-git-lfs-3.4.1"
},
"x86_64-linux": {
"store_path": "/nix/store/gla65g2rjqy51v3ydcg2rdxs6ij2zfdj-git-lfs-3.4.1"
}
}
},
"nodePackages.pnpm@latest": {
"last_modified": "2024-02-15T12:53:33Z",
"resolved": "github:NixOS/nixpkgs/085589047343aad800c4d305cf7b98e8a3d51ae2#nodePackages.pnpm",
"source": "devbox-search",
"version": "8.15.1",
"systems": {
"aarch64-darwin": {
"store_path": "/nix/store/ygq6gz5ayv52vfh7vkyyklfcd5w1f277-pnpm-8.15.1"
},
"aarch64-linux": {
"store_path": "/nix/store/hq4ia4477y09v544ix80adwqb4xp7h6r-pnpm-8.15.1"
},
"x86_64-darwin": {
"store_path": "/nix/store/vdmvnyvrmrv74vdisw1n8r3rjll29v07-pnpm-8.15.1"
},
"x86_64-linux": {
"store_path": "/nix/store/ihf1h9ck14xdfaqm4qh2ha3bdgfrqs3b-pnpm-8.15.1"
}
}
},
"nodejs@latest": {
"last_modified": "2024-02-15T12:53:33Z",
"resolved": "github:NixOS/nixpkgs/085589047343aad800c4d305cf7b98e8a3d51ae2#nodejs_21",
"source": "devbox-search",
"version": "21.6.2",
"systems": {
"aarch64-darwin": {
"store_path": "/nix/store/g1v8xrnj54xyvn6gf73mnw9pdwrdzmdi-nodejs-21.6.2"
},
"aarch64-linux": {
"store_path": "/nix/store/7ywds3cnww6ahc45g28c3c2rnf2aky2s-nodejs-21.6.2"
},
"x86_64-darwin": {
"store_path": "/nix/store/scbarnsw7m2g4j2m6rp5c82wyv1fzg8l-nodejs-21.6.2"
},
"x86_64-linux": {
"store_path": "/nix/store/6cip7ljx563rlyfnzwkfzf6gxpy1lid0-nodejs-21.6.2"
}
}
},
"sqlite@latest": {
"last_modified": "2024-02-20T22:56:03Z",
"resolved": "github:NixOS/nixpkgs/5eeded8e3518579daa13887297efa79f5be74b41#sqlite",
"source": "devbox-search",
"version": "3.45.1",
"systems": {
"aarch64-darwin": {
"store_path": "/nix/store/ay9wvqp3p548p08symcy3q7avm86ck00-sqlite-3.45.1-bin"
},
"aarch64-linux": {
"store_path": "/nix/store/1j3xiiq0i2q69nmwi9ja99grnwlpqzgy-sqlite-3.45.1-bin"
},
"x86_64-darwin": {
"store_path": "/nix/store/c9wndvymfnm4dcmi0bgy1lmljzx8dv4n-sqlite-3.45.1-bin"
},
"x86_64-linux": {
"store_path": "/nix/store/b3pbwqqpfllg2lxwhhv4iyw3m9xxbgld-sqlite-3.45.1-bin"
}
}
}
}
}
95 changes: 52 additions & 43 deletions scraper/index.ts
@@ -19,7 +19,7 @@ const scrapLikesCommand = command({
async handler({ n }) {
const db = await connectDb(process.env.DB_PATH);
const scraper = new Scraper(db);
const cuenta = await this.getRandomAccount();
const cuenta = await scraper.getRandomAccount();
await scraper.scrap(cuenta, n);
await scraper.browser?.close();
},
@@ -240,6 +240,7 @@ class Scraper {
totalTweetsSeen: count,
})
.where(eq(schema.scraps.id, scrapId));
return count;
} catch (error) {
console.error(`oneoff[${cuenta?.id}]:`, error);
} finally {
@@ -254,12 +255,16 @@ class Scraper {
let i = 0;
while (true) {
const cuenta = await this.getRandomAccount();
await this.scrap(cuenta);
{
const count = await this.scrap(cuenta);
if (count) console.info(`scrapped likes, seen ${count}`);
}
i--;
if (i <= 0) {
try {
const result = await this.scrapTweets({ cuenta });
await this.saveTweetsScrap(result);
console.info(`scrapped retweets, seen ${result.tweetsSeen}`);
} catch (error) {
console.error(`tweets[${cuenta.id}]:`, error);
}
@@ -285,7 +290,7 @@
* @returns the number of tweets seen (not saved)
*/
async scrapTweets({
n = 5,
n = 10,
saveApiResponses = false,
cuenta,
}: {
@@ -302,48 +307,52 @@

// setup API response capturing and parsing
page.on("response", async (response) => {
  try {
    const req = response.request();
    const url = req.url();
    if (
      url.includes("/graphql/") &&
      url.includes("UserTweets?") &&
      req.method() === "GET"
    ) {
      const json = await response.json();
      if (saveApiResponses) {
        await mkdir("debug-api-responses", { recursive: true });
        await writeFile(
          `debug-api-responses/${+new Date()}-${nanoid()}.json`,
          JSON.stringify(json, undefined, 2),
        );
      }

      const parsed = zUserTweetsRes.parse(json);
      const entries =
        parsed.data.user.result.timeline_v2.timeline.instructions
          .filter(
            (x): x is z.infer<typeof zTimelineAddEntriesEntry> =>
              "entries" in x,
          )
          .flatMap((x) =>
            x.entries
              .map((e) => e.content)
              .filter(
                (y): y is TimelineTimelineItem =>
                  y.entryType === "TimelineTimelineItem",
              ),
          )
          // filter out promoted tweets (ads)
          .filter((e) => !e.itemContent.tweet_results.promotedMetadata)
          .map((e) => e.itemContent.tweet_results.result)
          // keep only results that are actual tweets
          .filter(
            (e): e is z.infer<typeof zUserTweetsTweetResultTweet> =>
              e.__typename === "Tweet",
          );
      for (const entry of entries) {
        map.set(entry.legacy.id_str, entry.legacy);
      }
    }
  } catch (error) {
    console.error(`could not capture API request`, error);
  }
});

21 changes: 21 additions & 0 deletions src/lib/data-processing/mostLiked.ts
@@ -0,0 +1,21 @@
import type { LikedTweet } from "../../schema";

export function getUsernameFromUrl(url: string): string | null {
const matches = url.match(/^https:\/\/twitter\.com\/(.+?)\//);
if (!matches) return null;
const [, username] = matches;
return username;
}

export function sortMost(tweets: LikedTweet[]) {
const map = new Map<string, number>();
for (const tweet of tweets) {
const username = getUsernameFromUrl(tweet.url);
if (!username) continue;
map.set(username, (map.get(username) ?? 0) + 1);
}
return Array.from(map)
.filter(([, n]) => n > 3)
.sort(([, a], [, b]) => b - a)
.slice(0, 10);
}
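For context, a minimal usage sketch of `sortMost` (not part of this PR; the sample rows, the cast, and the import paths are assumptions):

```ts
import type { LikedTweet } from "../../schema";
import { sortMost } from "./mostLiked";

// sample rows shaped around the `url` field sortMost relies on;
// real LikedTweet rows come from the database and carry more fields.
const liked = [
  { url: "https://twitter.com/someuser/status/1" },
  { url: "https://twitter.com/someuser/status/2" },
  { url: "https://twitter.com/someuser/status/3" },
  { url: "https://twitter.com/someuser/status/4" },
] as LikedTweet[];

// ranking is an array of [username, count] pairs: accounts liked more
// than 3 times, sorted by count descending, capped at the top 10.
const ranking = sortMost(liked);
for (const [username, count] of ranking) {
  console.log(`${username}: ${count} likes`); // prints "someuser: 4 likes"
}
```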
86 changes: 86 additions & 0 deletions src/lib/data-processing/screenTime.ts
@@ -0,0 +1,86 @@
import type { Dayjs } from "dayjs";
import type { LikedTweet } from "../../schema";
import dayjs from "dayjs";
import { formatDuration, intervalToDuration } from "date-fns";
import { es } from "date-fns/locale";

export type Duration = { start: Dayjs; end: Dayjs };
export function calculateScreenTime(tweets: LikedTweet[]): Duration[] {
const n = 3;
const durations = tweets
.map((t) => dayjs(t.firstSeenAt))
.map((d) => ({ start: d, end: d.add(n, "minute") }));

type StartEnd = {
type: "start" | "end";
date: Dayjs;
};
const startEnds: Array<StartEnd> = durations
.flatMap<StartEnd>(({ start, end }) => [
{ type: "start", date: start },
{ type: "end", date: end },
])
.sort(({ date: a }, { date: b }) => a.diff(b));

// console.debug(startEnds.map((x) => [x.type, x.date.toDate()]));

let finalStartEnds: Array<StartEnd> = [];

// https://stackoverflow.com/questions/45109429/merge-sets-of-overlapping-time-periods-into-new-one
let i = 0;
for (const startEnd of startEnds) {
if (startEnd.type === "start") {
i++;
if (i === 1) finalStartEnds.push(startEnd);
} else {
if (i === 1) finalStartEnds.push(startEnd);
i--;
}
}
// console.debug(finalStartEnds.map((x) => [x.type, x.date.toDate()]));

let finalDurations: Array<Duration> = [];

while (finalStartEnds.length > 0) {
const [start, end] = finalStartEnds.splice(0, 2);
if (start.type !== "start") throw new Error("expected start");
if (end.type !== "end") throw new Error("expected end");
finalDurations.push({
start: start.date,
end: end.date.subtract(n, "minute").add(2, "minute"),
});
}
return finalDurations;
}

/**
* @returns number - in milliseconds
*/
export function totalFromDurations(durations: Duration[]): number {
let total = 0;
for (const duration of durations) {
const time = duration.end.diff(duration.start);
total += time;
}
return total;
}

// https://stackoverflow.com/a/65711327
export function formatDurationFromMs(ms: number) {
const duration = intervalToDuration({ start: 0, end: ms });
return formatDuration(duration, {
locale: es,
delimiter: ", ",
format: ["hours", "minutes"],
});
}
export function formatTinyDurationFromMs(ms: number) {
const duration = intervalToDuration({ start: 0, end: ms });
// https://github.com/date-fns/date-fns/issues/2134
return formatDuration(duration, {
locale: es,
format: ["hours", "minutes"],
})
.replace(/ horas?/, "h")
.replace(/ minutos?/, "m");
}
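And a minimal sketch of how the screen-time helpers compose (also not part of this PR; the timestamps, the cast, and the import paths are assumptions):

```ts
import type { LikedTweet } from "../../schema";
import {
  calculateScreenTime,
  totalFromDurations,
  formatDurationFromMs,
} from "./screenTime";

// two likes close together plus one isolated like ~1.5 hours later;
// real rows come from the database and carry more fields than firstSeenAt.
const liked = [
  { firstSeenAt: "2024-02-20T10:00:00Z" },
  { firstSeenAt: "2024-02-20T10:02:00Z" },
  { firstSeenAt: "2024-02-20T11:30:00Z" },
] as unknown as LikedTweet[];

// each like opens a 3-minute window and overlapping windows are merged,
// so the first two likes collapse into a single session.
const sessions = calculateScreenTime(liked); // two Duration ranges
const totalMs = totalFromDurations(sessions);
console.log(formatDurationFromMs(totalMs)); // "6 minutos" (es locale)
```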