From 71643d296f61ae2b13e25f935112896c2a13ab54 Mon Sep 17 00:00:00 2001 From: Yashwardhan Jyani <100014271+yashwardhan-jyani@users.noreply.github.com> Date: Wed, 17 Apr 2024 20:32:38 +0530 Subject: [PATCH] Fix part of #19636: Added beam jobs for user bio validation (#19993) * added beam job * linting * Small Changes * linting * minor fixes * use username instead user ID * lint checks * license date correction --- core/jobs/batch_jobs/user_validation_jobs.py | 91 +++++++++++++++++ .../batch_jobs/user_validation_jobs_test.py | 99 +++++++++++++++++++ core/jobs/registry.py | 1 + scripts/backend_test_shards.json | 1 + .../linters/test_files/valid_job_imports.py | 1 + 5 files changed, 193 insertions(+) create mode 100644 core/jobs/batch_jobs/user_validation_jobs.py create mode 100644 core/jobs/batch_jobs/user_validation_jobs_test.py diff --git a/core/jobs/batch_jobs/user_validation_jobs.py b/core/jobs/batch_jobs/user_validation_jobs.py new file mode 100644 index 000000000000..0d1ab4101ad0 --- /dev/null +++ b/core/jobs/batch_jobs/user_validation_jobs.py @@ -0,0 +1,91 @@ +# coding: utf-8 +# +# Copyright 2024 The Oppia Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class GetUsersWithInvalidBioJob(base_jobs.JobBase):
    """Audit job that reports users whose stored bio is invalid.

    A bio counts as invalid when it is not a string (for example None), or
    when it is a string longer than 2000 characters.
    """

    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Builds the pipeline that finds and reports invalid user bios.

        Returns:
            PCollection. The flattened job results: the count of all queried
            users, the count of users with an invalid bio, and one stderr
            result per user with an invalid bio.
        """
        all_user_settings_models = (
            self.pipeline
            | 'Get all UserSettingsModels' >> ndb_io.GetModels(
                user_models.UserSettingsModel.get_all())
        )

        # Only the (username, bio) pair is needed downstream, so the rest
        # of each model is dropped as early as possible.
        username_bio_pairs = (
            all_user_settings_models
            | 'Extract username and bio from model' >> beam.Map(
                lambda model: (model.username, model.user_bio))
        )

        # A pair is kept when its bio is NOT a string of at most 2000
        # characters. The isinstance check runs first so that len() is never
        # called on a non-string bio.
        invalid_username_bio_pairs = (
            username_bio_pairs
            | 'Get users with null bio or bio with length greater than 2000' >>
            beam.Filter(
                lambda pair: not (
                    isinstance(pair[1], str) and len(pair[1]) <= 2000))
        )

        total_users_count_results = (
            username_bio_pairs
            | 'Report count of user models' >> (
                job_result_transforms.CountObjectsToJobRunResult(
                    'CountTotalUsers'))
        )

        invalid_users_count_results = (
            invalid_username_bio_pairs
            | 'Report count of invalid user models' >> (
                job_result_transforms.CountObjectsToJobRunResult(
                    'CountInvalidUserBios'))
        )

        invalid_user_details_results = (
            invalid_username_bio_pairs
            | 'Report info on each invalid user bio' >> beam.Map(
                lambda pair: job_run_result.JobRunResult.as_stderr(
                    'The username of user is "%s" and their bio is "%s"'
                    % (pair[0], pair[1])))
        )

        all_results = (
            total_users_count_results,
            invalid_users_count_results,
            invalid_user_details_results,
        )
        return all_results | 'Combine reported results' >> beam.Flatten()
class GetUsersWithInvalidBioJobTests(job_test_utils.JobTestBase):
    """Tests for GetUsersWithInvalidBioJob.

    Each test stores a single UserSettingsModel and checks the exact set of
    job results produced for it.
    """

    JOB_CLASS: Type[
        user_validation_jobs.GetUsersWithInvalidBioJob
    ] = user_validation_jobs.GetUsersWithInvalidBioJob

    USER_USERNAME_1: Final = 'user_1'
    USER_USERNAME_2: Final = 'user_2'
    USER_USERNAME_3: Final = 'user_3'
    USER_USERNAME_4: Final = 'user_4'
    # Longest bio, in characters, that the job under test accepts as valid.
    # The boundary tests below build their fixtures from this value instead
    # of hard-coding long literals.
    MAX_VALID_BIO_LENGTH: Final = 2000

    def test_empty_storage(self) -> None:
        """Checks that the job emits nothing when no users are stored."""
        self.assert_job_output_is_empty()

    def test_user_with_null_bio(self) -> None:
        """Checks that a user whose bio was never set is reported."""
        user = self.create_model(
            user_models.UserSettingsModel,
            username=self.USER_USERNAME_1,
            email='a@a.com'
        )
        user.update_timestamps()
        self.put_multi([user])

        self.assert_job_output_is([
            job_run_result.JobRunResult(
                stderr='The username of user is "user_1"'
                + ' and their bio is "None"'),
            job_run_result.JobRunResult(
                stdout='CountInvalidUserBios SUCCESS: 1'),
            job_run_result.JobRunResult(
                stdout='CountTotalUsers SUCCESS: 1')
        ])

    def test_user_with_valid_bio(self) -> None:
        """Checks that a user with a short bio is counted but not reported."""
        user = self.create_model(
            user_models.UserSettingsModel,
            username=self.USER_USERNAME_2,
            email='b@b.com',
            user_bio='Test Bio'
        )
        user.update_timestamps()
        self.put_multi([user])

        self.assert_job_output_is([
            job_run_result.JobRunResult(
                stdout='CountTotalUsers SUCCESS: 1')
        ])

    def test_user_with_bio_of_max_valid_length(self) -> None:
        """Checks that a bio of exactly the maximum length is still valid."""
        user = self.create_model(
            user_models.UserSettingsModel,
            username=self.USER_USERNAME_4,
            email='d@d.com',
            user_bio='a' * self.MAX_VALID_BIO_LENGTH
        )
        user.update_timestamps()
        self.put_multi([user])

        self.assert_job_output_is([
            job_run_result.JobRunResult(
                stdout='CountTotalUsers SUCCESS: 1')
        ])

    def test_user_with_too_long_bio(self) -> None:
        """Checks that a bio one character over the limit is reported."""
        # The bio is generated once and reused for the expected output, so
        # the fixture and the assertion cannot drift apart.
        too_long_bio = 'a' * (self.MAX_VALID_BIO_LENGTH + 1)
        user = self.create_model(
            user_models.UserSettingsModel,
            username=self.USER_USERNAME_3,
            email='c@c.com',
            user_bio=too_long_bio
        )
        user.update_timestamps()
        self.put_multi([user])

        self.assert_job_output_is([
            job_run_result.JobRunResult(
                stderr=(
                    'The username of user is "user_3" and their bio is "%s"'
                    % too_long_bio)),
            job_run_result.JobRunResult(
                stdout='CountInvalidUserBios SUCCESS: 1'),
            job_run_result.JobRunResult(
                stdout='CountTotalUsers SUCCESS: 1')
        ])
a/scripts/linters/test_files/valid_job_imports.py b/scripts/linters/test_files/valid_job_imports.py index b32e87bcb2a2..6a72015fd5b5 100644 --- a/scripts/linters/test_files/valid_job_imports.py +++ b/scripts/linters/test_files/valid_job_imports.py @@ -21,6 +21,7 @@ from __future__ import annotations from core.jobs.batch_jobs import blog_post_search_indexing_jobs # pylint: disable=unused-import # isort: skip from core.jobs.batch_jobs import blog_validation_jobs # pylint: disable=unused-import # isort: skip +from core.jobs.batch_jobs import user_validation_jobs # pylint: disable=unused-import # isort: skip from core.jobs.batch_jobs import collection_info_jobs # pylint: disable=unused-import # isort: skip from core.jobs.batch_jobs import email_deletion_jobs # pylint: disable=unused-import # isort: skip from core.jobs.batch_jobs import exp_migration_jobs # pylint: disable=unused-import # isort: skip