diff --git a/README.md b/README.md index 291e8b6..8046781 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,27 @@ pnpm dev anda a `/admin` y logeate con la contraseña. agrega una cuenta de Twitter copiando las cookies `auth_token` y `ct0`. -los scraps en dev corren solo una vez, en prod corren cada ~1 minuto. +### scraper + +podes correrlo manualmente: + +``` +pnpm scraper:run scrap retweets +pnpm scraper:run scrap likes +# y fijate con --help porque hay flags para debuggear, etc +``` + +o lo que se usa en prod que es el cron: + +``` +pnpm scraper:run cron +``` ## producción ``` git pull && pnpm install && pnpm build && cp -r drizzle build/ node -r dotenv/config build +# en otra tty +pnpm scraper:run cron ``` diff --git a/devbox.json b/devbox.json new file mode 100644 index 0000000..b272a94 --- /dev/null +++ b/devbox.json @@ -0,0 +1,14 @@ +{ + "packages": [ + "nodejs@latest", + "nodePackages.pnpm@latest", + "git-lfs@latest", + "sqlite@latest" + ], + "shell": { + "init_hook": ["echo 'Welcome to devbox!' > /dev/null"], + "scripts": { + "dev": ["pnpm dev"] + } + } +} diff --git a/devbox.lock b/devbox.lock new file mode 100644 index 0000000..469c790 --- /dev/null +++ b/devbox.lock @@ -0,0 +1,85 @@ +{ + "lockfile_version": "1", + "packages": { + "git-lfs@latest": { + "last_modified": "2024-02-10T18:15:24Z", + "resolved": "github:NixOS/nixpkgs/10b813040df67c4039086db0f6eaf65c536886c6#git-lfs", + "source": "devbox-search", + "version": "3.4.1", + "systems": { + "aarch64-darwin": { + "store_path": "/nix/store/a6paw80iqyaa3s56afd7lx41kh4j7rm2-git-lfs-3.4.1" + }, + "aarch64-linux": { + "store_path": "/nix/store/6prz36drh0sa8ir9ns5qx2ca582009a1-git-lfs-3.4.1" + }, + "x86_64-darwin": { + "store_path": "/nix/store/ds3g9yds887fabw77mxi6wfgl9rizmn3-git-lfs-3.4.1" + }, + "x86_64-linux": { + "store_path": "/nix/store/gla65g2rjqy51v3ydcg2rdxs6ij2zfdj-git-lfs-3.4.1" + } + } + }, + "nodePackages.pnpm@latest": { + "last_modified": "2024-02-15T12:53:33Z", + "resolved": "github:NixOS/nixpkgs/085589047343aad800c4d305cf7b98e8a3d51ae2#nodePackages.pnpm", + "source": "devbox-search", + "version": "8.15.1", + "systems": { + "aarch64-darwin": { + "store_path": "/nix/store/ygq6gz5ayv52vfh7vkyyklfcd5w1f277-pnpm-8.15.1" + }, + "aarch64-linux": { + "store_path": "/nix/store/hq4ia4477y09v544ix80adwqb4xp7h6r-pnpm-8.15.1" + }, + "x86_64-darwin": { + "store_path": "/nix/store/vdmvnyvrmrv74vdisw1n8r3rjll29v07-pnpm-8.15.1" + }, + "x86_64-linux": { + "store_path": "/nix/store/ihf1h9ck14xdfaqm4qh2ha3bdgfrqs3b-pnpm-8.15.1" + } + } + }, + "nodejs@latest": { + "last_modified": "2024-02-15T12:53:33Z", + "resolved": "github:NixOS/nixpkgs/085589047343aad800c4d305cf7b98e8a3d51ae2#nodejs_21", + "source": "devbox-search", + "version": "21.6.2", + "systems": { + "aarch64-darwin": { + "store_path": "/nix/store/g1v8xrnj54xyvn6gf73mnw9pdwrdzmdi-nodejs-21.6.2" + }, + "aarch64-linux": { + "store_path": "/nix/store/7ywds3cnww6ahc45g28c3c2rnf2aky2s-nodejs-21.6.2" + }, + "x86_64-darwin": { + "store_path": "/nix/store/scbarnsw7m2g4j2m6rp5c82wyv1fzg8l-nodejs-21.6.2" + }, + "x86_64-linux": { + "store_path": "/nix/store/6cip7ljx563rlyfnzwkfzf6gxpy1lid0-nodejs-21.6.2" + } + } + }, + "sqlite@latest": { + "last_modified": "2024-02-20T22:56:03Z", + "resolved": "github:NixOS/nixpkgs/5eeded8e3518579daa13887297efa79f5be74b41#sqlite", + "source": "devbox-search", + "version": "3.45.1", + "systems": { + "aarch64-darwin": { + "store_path": "/nix/store/ay9wvqp3p548p08symcy3q7avm86ck00-sqlite-3.45.1-bin" + }, + "aarch64-linux": { + "store_path": "/nix/store/1j3xiiq0i2q69nmwi9ja99grnwlpqzgy-sqlite-3.45.1-bin" + }, + "x86_64-darwin": { + "store_path": "/nix/store/c9wndvymfnm4dcmi0bgy1lmljzx8dv4n-sqlite-3.45.1-bin" + }, + "x86_64-linux": { + "store_path": "/nix/store/b3pbwqqpfllg2lxwhhv4iyw3m9xxbgld-sqlite-3.45.1-bin" + } + } + } + } +} diff --git a/scraper/index.ts b/scraper/index.ts index 458ac45..105cbdf 100644 --- a/scraper/index.ts +++ b/scraper/index.ts @@ -19,7 +19,7 @@ const scrapLikesCommand = command({ async handler({ n }) { const db = await connectDb(process.env.DB_PATH); const scraper = new Scraper(db); - const cuenta = await this.getRandomAccount(); + const cuenta = await scraper.getRandomAccount(); await scraper.scrap(cuenta, n); await scraper.browser?.close(); }, @@ -240,6 +240,7 @@ class Scraper { totalTweetsSeen: count, }) .where(eq(schema.scraps.id, scrapId)); + return count; } catch (error) { console.error(`oneoff[${cuenta?.id}]:`, error); } finally { @@ -254,12 +255,16 @@ class Scraper { let i = 0; while (true) { const cuenta = await this.getRandomAccount(); - await this.scrap(cuenta); + { + const count = await this.scrap(cuenta); + if (count) console.info(`scrapped likes, seen ${count}`); + } i--; if (i <= 0) { try { const result = await this.scrapTweets({ cuenta }); await this.saveTweetsScrap(result); + console.info(`scrapped retweets, seen ${result.tweetsSeen}`); } catch (error) { console.error(`tweets[${cuenta.id}]:`, error); } @@ -285,7 +290,7 @@ class Scraper { * @returns la cantidad de tweets vistos (no guardados) */ async scrapTweets({ - n = 5, + n = 10, saveApiResponses = false, cuenta, }: { @@ -302,48 +307,52 @@ class Scraper { // setup API response capturing and parsing page.on("response", async (response) => { - const req = response.request(); - const url = req.url(); - if ( - url.includes("/graphql/") && - url.includes("UserTweets?") && - req.method() === "GET" - ) { - const json = await response.json(); - if (saveApiResponses) { - await mkdir("debug-api-responses", { recursive: true }); - await writeFile( - `debug-api-responses/${+new Date()}-${nanoid()}.json`, - JSON.stringify(json, undefined, 2), - ); - } - - const parsed = zUserTweetsRes.parse(json); - const entries = - parsed.data.user.result.timeline_v2.timeline.instructions - .filter( - (x): x is z.infer => - "entries" in x, - ) - .flatMap((x) => - x.entries - .map((e) => e.content) - .filter( - (y): y is TimelineTimelineItem => - y.entryType === "TimelineTimelineItem", - ), - ) - // filtrar publicidades - .filter((e) => !e.itemContent.tweet_results.promotedMetadata) - .map((e) => e.itemContent.tweet_results.result) - // filtrar publicidades - .filter( - (e): e is z.infer => - e.__typename === "Tweet", + try { + const req = response.request(); + const url = req.url(); + if ( + url.includes("/graphql/") && + url.includes("UserTweets?") && + req.method() === "GET" + ) { + const json = await response.json(); + if (saveApiResponses) { + await mkdir("debug-api-responses", { recursive: true }); + await writeFile( + `debug-api-responses/${+new Date()}-${nanoid()}.json`, + JSON.stringify(json, undefined, 2), ); - for (const entry of entries) { - map.set(entry.legacy.id_str, entry.legacy); + } + + const parsed = zUserTweetsRes.parse(json); + const entries = + parsed.data.user.result.timeline_v2.timeline.instructions + .filter( + (x): x is z.infer => + "entries" in x, + ) + .flatMap((x) => + x.entries + .map((e) => e.content) + .filter( + (y): y is TimelineTimelineItem => + y.entryType === "TimelineTimelineItem", + ), + ) + // filtrar publicidades + .filter((e) => !e.itemContent.tweet_results.promotedMetadata) + .map((e) => e.itemContent.tweet_results.result) + // filtrar publicidades + .filter( + (e): e is z.infer => + e.__typename === "Tweet", + ); + for (const entry of entries) { + map.set(entry.legacy.id_str, entry.legacy); + } } + } catch (error) { + console.error(`no pude capturar pedido API`, error); } }); diff --git a/src/lib/data-processing/mostLiked.ts b/src/lib/data-processing/mostLiked.ts new file mode 100644 index 0000000..6272669 --- /dev/null +++ b/src/lib/data-processing/mostLiked.ts @@ -0,0 +1,21 @@ +import type { LikedTweet } from "../../schema"; + +export function getUsernameFromUrl(url: string): string | null { + const matches = url.match(/^https:\/\/twitter.com\/(.+?)\//); + if (!matches) return null; + const [, username] = matches; + return username; +} + +export function sortMost(tweets: LikedTweet[]) { + const map = new Map(); + for (const tweet of tweets) { + const username = getUsernameFromUrl(tweet.url); + if (!username) continue; + map.set(username, (map.get(username) ?? 0) + 1); + } + return Array.from(map) + .filter(([, n]) => n > 3) + .sort(([, a], [, b]) => b - a) + .slice(0, 10); +} diff --git a/src/lib/data-processing/screenTime.ts b/src/lib/data-processing/screenTime.ts new file mode 100644 index 0000000..8c3ed27 --- /dev/null +++ b/src/lib/data-processing/screenTime.ts @@ -0,0 +1,86 @@ +import type { Dayjs } from "dayjs"; +import type { LikedTweet } from "../../schema"; +import dayjs from "dayjs"; +import { formatDuration, intervalToDuration } from "date-fns"; +import { es } from "date-fns/locale"; + +export type Duration = { start: Dayjs; end: Dayjs }; +export function calculateScreenTime(tweets: LikedTweet[]): Duration[] { + const n = 3; + const durations = tweets + .map((t) => dayjs(t.firstSeenAt)) + .map((d) => ({ start: d, end: d.add(n, "minute") })); + + type StartEnd = { + type: "start" | "end"; + date: Dayjs; + }; + const startEnds: Array = durations + .flatMap(({ start, end }) => [ + { type: "start", date: start }, + { type: "end", date: end }, + ]) + .sort(({ date: a }, { date: b }) => a.diff(b)); + + // console.debug(startEnds.map((x) => [x.type, x.date.toDate()])); + + let finalStartEnds: Array = []; + + // https://stackoverflow.com/questions/45109429/merge-sets-of-overlapping-time-periods-into-new-one + let i = 0; + for (const startEnd of startEnds) { + if (startEnd.type === "start") { + i++; + if (i === 1) finalStartEnds.push(startEnd); + } else { + if (i === 1) finalStartEnds.push(startEnd); + i--; + } + } + // console.debug(finalStartEnds.map((x) => [x.type, x.date.toDate()])); + + let finalDurations: Array = []; + + while (finalStartEnds.length > 0) { + const [start, end] = finalStartEnds.splice(0, 2); + if (start.type !== "start") throw new Error("expected start"); + if (end.type !== "end") throw new Error("expected end"); + finalDurations.push({ + start: start.date, + end: end.date.subtract(n, "minute").add(2, "minute"), + }); + } + return finalDurations; +} + +/** + * @returns number - en milisegundos + */ +export function totalFromDurations(durations: Duration[]): number { + let total = 0; + for (const duration of durations) { + const time = duration.end.diff(duration.start); + total += time; + } + return total; +} + +// https://stackoverflow.com/a/65711327 +export function formatDurationFromMs(ms: number) { + const duration = intervalToDuration({ start: 0, end: ms }); + return formatDuration(duration, { + locale: es, + delimiter: ", ", + format: ["hours", "minutes"], + }); +} +export function formatTinyDurationFromMs(ms: number) { + const duration = intervalToDuration({ start: 0, end: ms }); + // https://github.com/date-fns/date-fns/issues/2134 + return formatDuration(duration, { + locale: es, + format: ["hours", "minutes"], + }) + .replace(/ horas?/, "h") + .replace(/ minutos?/, "m"); +} diff --git a/src/lib/data-processing/sleepTime.ts b/src/lib/data-processing/sleepTime.ts new file mode 100644 index 0000000..d9279f0 --- /dev/null +++ b/src/lib/data-processing/sleepTime.ts @@ -0,0 +1,33 @@ +import dayjs from "dayjs"; +import Tz from "dayjs/plugin/timezone"; +dayjs.extend(Tz); +import type { MiniLikedTweet } from "../../schema"; + +export function getLastSleepTime(likedTweets: MiniLikedTweet[]) { + const diffs = likedTweets + .sort((a, b) => -b.firstSeenAt - -a.firstSeenAt) + .map((value, index, array) => { + const next = array[index + 1]; + if (next) { + return { ...value, diff: +next.firstSeenAt - +value.firstSeenAt }; + } + return value; + }) + .toReversed() + .map((x, index) => ({ ...x, index })); + const last = diffs.find( + (x): x is { index: number; diff: number } & MiniLikedTweet => { + const time = dayjs(x.firstSeenAt) + .tz("America/Argentina/Buenos_Aires") + .hour(); + return ( + "diff" in x && x.diff > 5 * 60 * 60 * 1000 && (time > 21 || time < 4) + ); + }, + ); + if (!last) return null; + const ultimoTuitAntesDeDormir = last; + const primerTuitAlDespertarse = diffs[last.index - 1]; + + return { primerTuitAlDespertarse, ultimoTuitAntesDeDormir }; +} diff --git a/src/lib/data-processing/weekly.ts b/src/lib/data-processing/weekly.ts new file mode 100644 index 0000000..f45dc62 --- /dev/null +++ b/src/lib/data-processing/weekly.ts @@ -0,0 +1,44 @@ +import dayjs from "dayjs"; +import Utc from "dayjs/plugin/utc"; +import Tz from "dayjs/plugin/timezone"; +dayjs.extend(Utc); +dayjs.extend(Tz); +import type { LikedTweet, MiniRetweet } from "../../schema"; +import { calculateScreenTime, totalFromDurations } from "./screenTime"; + +export function lastWeek( + allLiked: Array, + allRetweets: Array, +) { + const today = dayjs + .tz(undefined, "America/Argentina/Buenos_Aires") + .startOf("day"); + + const days = [ + today.subtract(7, "day"), + today.subtract(6, "day"), + today.subtract(5, "day"), + today.subtract(4, "day"), + today.subtract(3, "day"), + today.subtract(2, "day"), + today.subtract(1, "day"), + today, + ]; + + return days.map((day) => { + const tweets = allLiked.filter((t) => { + const date = dayjs(t.firstSeenAt); + return date.isAfter(day) && date.isBefore(day.add(1, "day")); + }); + const retweets = allRetweets.filter((t) => { + const date = dayjs(t.retweetAt); + return date.isAfter(day) && date.isBefore(day.add(1, "day")); + }); + return { + day, + tweets, + retweets, + screenTime: totalFromDurations(calculateScreenTime(tweets)), + }; + }); +} diff --git a/src/routes/+page.server.ts b/src/routes/+page.server.ts index a56e71a..1e58dd1 100644 --- a/src/routes/+page.server.ts +++ b/src/routes/+page.server.ts @@ -1,5 +1,5 @@ import { db } from "$lib/db"; -import { desc } from "drizzle-orm"; +import { desc, isNotNull } from "drizzle-orm"; import { likedTweets, retweets, scraps } from "../schema"; import type { PageServerLoad } from "./$types"; @@ -20,8 +20,9 @@ export const load: PageServerLoad = async ({ params, setHeaders }) => { }, orderBy: desc(retweets.retweetAt), }); - const lastUpdated = await db.query.likedTweets.findFirst({ - orderBy: desc(likedTweets.firstSeenAt), + const lastUpdated = await db.query.scraps.findFirst({ + orderBy: desc(scraps.at), + where: isNotNull(scraps.totalTweetsSeen), }); setHeaders({ diff --git a/src/routes/+page.svelte b/src/routes/+page.svelte index b08cea9..74c1c07 100644 --- a/src/routes/+page.svelte +++ b/src/routes/+page.svelte @@ -1,17 +1,19 @@ + +
+
+ +
¡Ey!
+
+ Esta página es un experimento, y puede ser particularmente imprecisa o en + general ser una bosta. +
+
+ +

Últimos 200 likes de @JMilei

+ +
    + {#each data.tweets as tweet} +
  • + @{getUsernameFromUrl(tweet.url)}: + {tweet.text} + (likeado aprox. {timeFormatter.format(tweet.firstSeenAt)}) +
  • + {/each} +
+