diff --git a/src/stats/meme.py b/src/stats/meme.py
index 8f96b85..2a89ed6 100644
--- a/src/stats/meme.py
+++ b/src/stats/meme.py
@@ -3,8 +3,49 @@ from src.database import execute
 
 
 
-async def calculate_meme_reactions_stats() -> None:
-    insert_query = """
+async def calculate_meme_reactions_stats(
+    min_user_reactions: int = 10,
+    min_meme_reactions: int = 3,
+) -> None:
+    """Recalculate per-meme reaction stats and upsert them into meme_stats.
+
+    Besides the plain counters, the lr_smoothed metric is stored.
+    Smoothing is needed to handle the cases when users have very diverse
+    like rates.
+
+    Step 1. Change the target to the symmetrical form:
+
+        like_symmetrical: like -> 1, any other reaction -> -1
+
+    Step 2. Calculate the symmetrical like rate for each user:
+
+        user_like_rate = avg(like_symmetrical)
+
+        It is a running average ordered by sent_at, so a reaction is only
+        compared with the user's reactions to memes sent up to that moment.
+
+    Step 3. Calculate smoothed likes:
+
+        like_smoothed = like_symmetrical - user_like_rate
+        like_smoothed in (-2, 2)
+
+    Step 4. Calculate the meme smoothed like rate:
+
+        lr_smoothed = avg(like_smoothed)
+        lr_smoothed in (-2, 2)
+
+    Args:
+        min_user_reactions: a reaction is used only once its user has at
+            least this many reactions (running count ordered by sent_at).
+        min_meme_reactions: memes left with fewer usable reactions get
+            lr_smoothed = 0.
+    """
+    # Both thresholds are interpolated into the SQL text below, so coerce
+    # them to plain integers to rule out SQL injection.
+    min_user_reactions = int(min_user_reactions)
+    min_meme_reactions = int(min_meme_reactions)
+
+    insert_query = f"""
         INSERT INTO meme_stats (
             meme_id,
             nlikes,
@@ -12,23 +53,60 @@ async def calculate_meme_reactions_stats() -> None:
             nmemes_sent,
             age_days,
             sec_to_react,
-            updated_at
+            updated_at,
+            lr_smoothed
         )
-        SELECT
-            meme_id
-            , COUNT(*) FILTER (WHERE reaction_id = 1) nlikes
-            , COUNT(*) FILTER (WHERE reaction_id = 2) ndislikes
-            , COUNT(*) nmemes_sent
-            , MAX(EXTRACT('DAYS' FROM NOW() - M.published_at)) age_days
-            , COALESCE(EXTRACT(
-                EPOCH FROM
-                percentile_cont(0.5) WITHIN GROUP (ORDER BY reacted_at - sent_at)
-            ), 99999) AS sec_to_react
-            , NOW() AS updated_at
-        FROM user_meme_reaction E
-        INNER JOIN meme M
-            ON M.id = E.meme_id
-        GROUP BY 1
+
+        WITH LR AS (
+            SELECT *
+            FROM (
+                SELECT meme_id, AVG(lr_smoothed) lr_smoothed, COUNT(user_id) n_reactions
+                FROM (
+                    SELECT *, (like_ - lr_avg) lr_smoothed
+                    FROM (
+                        SELECT
+                            user_id,
+                            meme_id,
+                            like_,
+                            sent_at,
+                            AVG(like_) OVER (PARTITION BY user_id ORDER BY sent_at) lr_avg,
+                            COUNT(like_) over (PARTITION BY user_id ORDER BY sent_at) n_reactions
+                        FROM (
+                            SELECT *, CASE WHEN reaction_id = 1 THEN 1 ELSE -1 END like_
+                            FROM user_meme_reaction r
+                            JOIN meme
+                                ON r.meme_id = meme.id
+                            WHERE reaction_id IS NOT NULL
+                        ) t
+                    ) t
+                    WHERE n_reactions >= {min_user_reactions}
+                ) t
+                GROUP BY meme_id
+            ) t
+            WHERE n_reactions >= {min_meme_reactions}
+        )
+        SELECT
+            MS.*,
+            COALESCE(LR.lr_smoothed, 0) lr_smoothed
+        FROM (
+            SELECT
+                meme_id
+                , COUNT(*) FILTER (WHERE reaction_id = 1) nlikes
+                , COUNT(*) FILTER (WHERE reaction_id = 2) ndislikes
+                , COUNT(*) nmemes_sent
+                , MAX(EXTRACT('DAYS' FROM NOW() - M.published_at)) age_days
+                , COALESCE(EXTRACT(
+                    EPOCH FROM
+                    percentile_cont(0.5) WITHIN GROUP (ORDER BY reacted_at - sent_at)
+                ), 99999) AS sec_to_react
+                , NOW() AS updated_at
+            FROM user_meme_reaction E
+            INNER JOIN meme M
+                ON M.id = E.meme_id
+            GROUP BY 1
+        ) MS
+        LEFT JOIN LR
+            ON MS.meme_id = LR.meme_id
 
         ON CONFLICT (meme_id) DO
         UPDATE SET
@@ -37,7 +115,8 @@ async def calculate_meme_reactions_stats() -> None:
             nmemes_sent = EXCLUDED.nmemes_sent,
             age_days = EXCLUDED.age_days,
             sec_to_react = EXCLUDED.sec_to_react,
-            updated_at = EXCLUDED.updated_at
+            updated_at = EXCLUDED.updated_at,
+            lr_smoothed = EXCLUDED.lr_smoothed
     """
 
     await execute(text(insert_query))
diff --git a/tests/stats/test_meme.py b/tests/stats/test_meme.py
index 643ec61..ad45745 100644
--- a/tests/stats/test_meme.py
+++ b/tests/stats/test_meme.py
@@ -27,7 +27,7 @@ async def conn():
         'type': 'image', 'telegram_image_id': '111', 'caption': '111', 'meme_source_id': 1,
         'published_at': datetime(2024, 1, 1), 'status': 'ok', 'language_code': 'ru',
     }
-    meme_ids = [1, 2, 3, 4, 5]
+    meme_ids = [1, 2, 3, 4, 5, 6]
     await conn.execute(
         insert(meme),
         [{'id': meme_id, 'raw_meme_id': meme_id, **meme_common} for meme_id in meme_ids]
@@ -49,12 +49,14 @@ async def conn():
             {'user_id': 1, 'meme_id': 2, 'reaction_id': 1, **umr_common},
             {'user_id': 1, 'meme_id': 3, 'reaction_id': 1, **umr_common},
             {'user_id': 1, 'meme_id': 4, 'reaction_id': 1, **umr_common},
-            {'user_id': 1, 'meme_id': 5, 'reaction_id': 2, **umr_common},
+            {'user_id': 1, 'meme_id': 5, 'reaction_id': 1, **umr_common},
+            {'user_id': 1, 'meme_id': 6, 'reaction_id': 2, **umr_common},
             {'user_id': 2, 'meme_id': 1, 'reaction_id': 1, **umr_common},
             {'user_id': 2, 'meme_id': 2, 'reaction_id': 2, **umr_common},
             {'user_id': 2, 'meme_id': 3, 'reaction_id': 2, **umr_common},
             {'user_id': 2, 'meme_id': 4, 'reaction_id': 2, **umr_common},
             {'user_id': 2, 'meme_id': 5, 'reaction_id': 2, **umr_common},
+            {'user_id': 2, 'meme_id': 6, 'reaction_id': 2, **umr_common},
         ]
     )
 
@@ -73,8 +75,13 @@ async def conn():
 
 @pytest.mark.asyncio
 async def test_calculate_meme_reactions_stats(conn: AsyncConnection):
-    await calculate_meme_reactions_stats()
+    await calculate_meme_reactions_stats(min_meme_reactions=0, min_user_reactions=0)
 
     res = await fetch_all(select(meme_stats))
+    assert len(res) == 6
 
-    assert len(res) == 5
\ No newline at end of file
+    # Index by meme_id so that a missing row fails instead of being skipped.
+    lr_smoothed = {row['meme_id']: float(row['lr_smoothed']) for row in res}
+    eps = 1e-3
+    assert abs(lr_smoothed[1] - 1) < eps
+    assert abs(lr_smoothed[2]) < eps