diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 507949604e..49f82d12e1 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -23,11 +23,13 @@ DeleteOrgJob, RecalculateOrgStatsJob, ReAddOrgPagesJob, + OptimizePagesJob, PaginatedBackgroundJobResponse, AnyJob, StorageRef, User, SuccessResponse, + SuccessResponseId, ) from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .utils import dt_now @@ -52,6 +54,8 @@ class BackgroundJobOps: base_crawl_ops: BaseCrawlOps profile_ops: ProfileOps + migration_jobs_scale: int + # pylint: disable=too-many-locals, too-many-arguments, invalid-name def __init__(self, mdb, email, user_manager, org_ops, crawl_manager, storage_ops): @@ -67,6 +71,8 @@ def __init__(self, mdb, email, user_manager, org_ops, crawl_manager, storage_ops self.base_crawl_ops = cast(BaseCrawlOps, None) self.profile_ops = cast(ProfileOps, None) + self.migration_jobs_scale = int(os.environ.get("MIGRATION_JOBS_SCALE", 1)) + self.router = APIRouter( prefix="/jobs", tags=["jobs"], @@ -382,6 +388,7 @@ async def create_re_add_org_pages_job( self, oid: UUID, crawl_type: Optional[str] = None, + crawl_id: Optional[str] = None, existing_job_id: Optional[str] = None, ): """Create job to (re)add all pages in an org, optionally filtered by crawl type""" @@ -390,6 +397,7 @@ async def create_re_add_org_pages_job( job_id = await self.crawl_manager.run_re_add_org_pages_job( oid=str(oid), crawl_type=crawl_type, + crawl_id=crawl_id, existing_job_id=existing_job_id, ) if existing_job_id: @@ -410,6 +418,7 @@ async def create_re_add_org_pages_job( id=job_id, oid=oid, crawl_type=crawl_type, + crawl_id=crawl_id, started=dt_now(), ) @@ -424,18 +433,58 @@ async def create_re_add_org_pages_job( print(f"warning: re-add org pages job could not be started: {exc}") return None + async def create_optimize_crawl_pages_job( + self, + existing_job_id: Optional[str] = None, + ): + """Create job to optimize crawl pages""" + + try: + job_id = await self.crawl_manager.run_optimize_pages_job( + existing_job_id=existing_job_id, scale=self.migration_jobs_scale + ) + if existing_job_id: + optimize_pages_job = await self.get_background_job(existing_job_id) + previous_attempt = { + "started": optimize_pages_job.started, + "finished": optimize_pages_job.finished, + } + if optimize_pages_job.previousAttempts: + optimize_pages_job.previousAttempts.append(previous_attempt) + else: + optimize_pages_job.previousAttempts = [previous_attempt] + optimize_pages_job.started = dt_now() + optimize_pages_job.finished = None + optimize_pages_job.success = None + else: + optimize_pages_job = OptimizePagesJob( + id=job_id, + started=dt_now(), + ) + + await self.jobs.find_one_and_update( + {"_id": job_id}, {"$set": optimize_pages_job.to_dict()}, upsert=True + ) + + return job_id + # pylint: disable=broad-exception-caught + except Exception as exc: + # pylint: disable=raise-missing-from + print(f"warning: optimize pages job could not be started: {exc}") + return None + async def job_finished( self, job_id: str, job_type: str, - oid: UUID, success: bool, finished: datetime, + oid: Optional[UUID] = None, ) -> None: """Update job as finished, including job-specific task handling""" - job = await self.get_background_job(job_id, oid) + job = await self.get_background_job(job_id) if job.finished: return @@ -455,14 +504,16 @@ async def job_finished( flush=True, ) superuser = await self.user_manager.get_superuser() - org = await 
self.org_ops.get_org_by_id(job.oid) + org = None + if job.oid: + org = await self.org_ops.get_org_by_id(job.oid) await asyncio.get_event_loop().run_in_executor( None, self.email.send_background_job_failed, job, - org, finished, superuser.email, + org, ) await self.jobs.find_one_and_update( @@ -478,6 +529,7 @@ async def get_background_job( DeleteOrgJob, RecalculateOrgStatsJob, ReAddOrgPagesJob, + OptimizePagesJob, ]: """Get background job""" query: dict[str, object] = {"_id": job_id} @@ -504,11 +556,14 @@ def _get_job_by_type_from_data(self, data: dict[str, object]): if data["type"] == BgJobType.READD_ORG_PAGES: return ReAddOrgPagesJob.from_dict(data) + if data["type"] == BgJobType.OPTIMIZE_PAGES: + return OptimizePagesJob.from_dict(data) + return DeleteOrgJob.from_dict(data) async def list_background_jobs( self, - org: Organization, + org: Optional[Organization] = None, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1, success: Optional[bool] = None, @@ -522,7 +577,10 @@ async def list_background_jobs( page = page - 1 skip = page_size * page - query: dict[str, object] = {"oid": org.id} + query: dict[str, object] = {} + + if org: + query["oid"] = org.id if success in (True, False): query["success"] = success @@ -590,10 +648,10 @@ async def get_replica_job_file( raise HTTPException(status_code=404, detail="file_not_found") async def retry_background_job( - self, job_id: str, org: Organization - ) -> Dict[str, Union[bool, Optional[str]]]: + self, job_id: str, org: Optional[Organization] = None + ): """Retry background job""" - job = await self.get_background_job(job_id, org.id) + job = await self.get_background_job(job_id) if not job: raise HTTPException(status_code=404, detail="job_not_found") @@ -603,7 +661,23 @@ async def retry_background_job( if job.success: raise HTTPException(status_code=400, detail="job_already_succeeded") + if org: + return await self.retry_org_background_job(job, org) + + if job.type == BgJobType.OPTIMIZE_PAGES: + await self.create_optimize_crawl_pages_job( + existing_job_id=job_id, + ) + return {"success": True} + + return {"success": False} + + async def retry_org_background_job( + self, job: BackgroundJob, org: Organization + ) -> Dict[str, Union[bool, Optional[str]]]: + """Retry background job specific to one org""" if job.type == BgJobType.CREATE_REPLICA: + job = cast(CreateReplicaJob, job) file = await self.get_replica_job_file(job, org) primary_storage = self.storage_ops.get_org_storage_by_ref(org, file.storage) primary_endpoint, bucket_suffix = self.strip_bucket( @@ -618,10 +692,12 @@ async def retry_background_job( job.replica_storage, primary_file_path, primary_endpoint, - existing_job_id=job_id, + existing_job_id=job.id, ) + return {"success": True} if job.type == BgJobType.DELETE_REPLICA: + job = cast(DeleteReplicaJob, job) file = await self.get_replica_job_file(job, org) await self.create_delete_replica_job( org, @@ -630,31 +706,39 @@ async def retry_background_job( job.object_type, job.replica_storage, force_start_immediately=True, - existing_job_id=job_id, + existing_job_id=job.id, ) + return {"success": True} if job.type == BgJobType.DELETE_ORG: + job = cast(DeleteOrgJob, job) await self.create_delete_org_job( org, - existing_job_id=job_id, + existing_job_id=job.id, ) + return {"success": True} if job.type == BgJobType.RECALCULATE_ORG_STATS: + job = cast(RecalculateOrgStatsJob, job) await self.create_recalculate_org_stats_job( org, - existing_job_id=job_id, + existing_job_id=job.id, ) + return {"success": True} if job.type == 
BgJobType.READD_ORG_PAGES:
+            job = cast(ReAddOrgPagesJob, job)
             await self.create_re_add_org_pages_job(
                 org.id,
                 job.crawl_type,
-                existing_job_id=job_id,
+                job.crawl_id,
+                existing_job_id=job.id,
             )
+            return {"success": True}
 
-        return {"success": True}
+        return {"success": False}
 
-    async def retry_failed_background_jobs(
+    async def retry_failed_org_background_jobs(
         self, org: Organization
     ) -> Dict[str, Union[bool, Optional[str]]]:
         """Retry all failed background jobs in an org
@@ -679,7 +763,9 @@ async def retry_all_failed_background_jobs(
         """
         bg_tasks = set()
         async for job in self.jobs.find({"success": False}):
-            org = await self.org_ops.get_org_by_id(job["oid"])
+            org = None
+            if job.get("oid"):
+                org = await self.org_ops.get_org_by_id(job["oid"])
             task = asyncio.create_task(self.retry_background_job(job["_id"], org))
             bg_tasks.add(task)
             task.add_done_callback(bg_tasks.discard)
@@ -707,14 +793,14 @@ def init_background_jobs_api(
         "/{job_id}",
         response_model=AnyJob,
     )
-    async def get_background_job(
+    async def get_org_background_job(
         job_id: str,
         org: Organization = Depends(org_crawl_dep),
     ):
         """Retrieve information for background job"""
         return await ops.get_background_job(job_id, org.id)
 
-    @app.get("/orgs/all/jobs/{job_id}", response_model=SuccessResponse, tags=["jobs"])
+    @app.get("/orgs/all/jobs/{job_id}", response_model=AnyJob, tags=["jobs"])
     async def get_background_job_all_orgs(job_id: str, user: User = Depends(user_dep)):
         """Get background job from any org"""
         if not user.is_superuser:
@@ -722,8 +808,36 @@ async def get_background_job_all_orgs(job_id: str, user: User = Depends(user_dep)
 
         return await ops.get_background_job(job_id)
 
-    @router.post("/{job_id}/retry", response_model=SuccessResponse)
-    async def retry_background_job(
+    @app.post(
+        "/orgs/all/jobs/{job_id}/retry", response_model=SuccessResponse, tags=["jobs"]
+    )
+    async def retry_background_job_no_org(job_id: str, user: User = Depends(user_dep)):
+        """Retry background job that doesn't belong to an org, e.g. migration job"""
+        if not user.is_superuser:
+            raise HTTPException(status_code=403, detail="Not Allowed")
+
+        job = await ops.get_background_job(job_id)
+
+        org = None
+        if job.oid:
+            org = await ops.org_ops.get_org_by_id(job.oid)
+
+        return await ops.retry_background_job(job_id, org)
+
+    @app.post(
+        "/orgs/all/jobs/migrateCrawls", response_model=SuccessResponseId, tags=["jobs"]
+    )
+    async def create_migrate_crawls_job(user: User = Depends(user_dep)):
+        """Launch background job to migrate all crawls to v2 with optimized pages"""
+        if not user.is_superuser:
+            raise HTTPException(status_code=403, detail="Not Allowed")
+
+        job_id = await ops.create_optimize_crawl_pages_job()
+
+        return {"success": True, "id": job_id}
+
+    @router.post("/{job_id}/retry", response_model=SuccessResponse, tags=["jobs"])
+    async def retry_org_background_job(
         job_id: str,
         org: Organization = Depends(org_crawl_dep),
     ):
@@ -740,14 +854,41 @@ async def retry_all_failed_background_jobs(user: User = Depends(user_dep)):
         return await ops.retry_all_failed_background_jobs()
 
-    @router.post("/retryFailed", response_model=SuccessResponse)
-    async def retry_failed_background_jobs(
+    @router.post("/retryFailed", response_model=SuccessResponse, tags=["jobs"])
+    async def retry_failed_org_background_jobs(
         org: Organization = Depends(org_crawl_dep),
     ):
         """Retry failed background jobs"""
-        return await ops.retry_failed_background_jobs(org)
+        return await ops.retry_failed_org_background_jobs(org)
+
+    @app.get(
+        "/orgs/all/jobs", response_model=PaginatedBackgroundJobResponse, tags=["jobs"]
+    )
+    async def list_all_background_jobs(
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+        success: Optional[bool] = None,
+        jobType: Optional[str] = None,
+        sortBy: Optional[str] = None,
+        sortDirection: Optional[int] = -1,
+        user: User = Depends(user_dep),
+    ):
+        """Retrieve paginated list of background jobs"""
+        if not user.is_superuser:
+            raise HTTPException(status_code=403, detail="Not Allowed")
+
+        jobs, total = await ops.list_background_jobs(
+            org=None,
+            page_size=pageSize,
+            page=page,
+            success=success,
+            job_type=jobType,
+            sort_by=sortBy,
+            sort_direction=sortDirection,
+        )
+        return paginated_format(jobs, total, page, pageSize)
 
-    @router.get("", response_model=PaginatedBackgroundJobResponse)
+    @router.get("", response_model=PaginatedBackgroundJobResponse, tags=["jobs"])
     async def list_background_jobs(
         org: Organization = Depends(org_crawl_dep),
         pageSize: int = DEFAULT_PAGE_SIZE,
@@ -759,7 +900,7 @@ async def list_background_jobs(
     ):
         """Retrieve paginated list of background jobs"""
         jobs, total = await ops.list_background_jobs(
-            org,
+            org=org,
             page_size=pageSize,
             page=page,
             success=success,
diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index b5bb52a33f..ee852bf336 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -170,16 +170,17 @@ async def get_crawl_out(
         if coll_ids:
             res["collections"] = await self.colls.get_collection_names(coll_ids)
 
-        res["initialPages"], _ = await self.page_ops.list_pages(
-            crawlid, is_seed=True, page_size=25
-        )
-
-        oid = res.get("oid")
-        if oid:
-            res["pagesQueryUrl"] = (
-                get_origin(headers) + f"/api/orgs/{oid}/crawls/{crawlid}/pages"
+        if res.get("version", 1) == 2:
+            res["initialPages"] = await self.page_ops.list_replay_query_pages(
+                crawl_ids=[crawlid], is_seed=True, page_size=25
             )
+            oid = res.get("oid")
+            if oid:
+                res["pagesQueryUrl"] = (
+                    get_origin(headers) + f"/api/orgs/{oid}/crawls/{crawlid}/pages"
+                )
+
crawl = CrawlOutWithResources.from_dict(res) if not skip_resources: diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 2103c61e40..0e263056cc 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -6,10 +6,8 @@ from datetime import datetime from collections import Counter from uuid import UUID, uuid4 -from typing import Optional, List, TYPE_CHECKING, cast, Dict, Tuple, Any, Union +from typing import Optional, List, TYPE_CHECKING, cast, Dict, Any, Union import os -import re -import urllib.parse import asyncio import pymongo @@ -44,16 +42,13 @@ OrgPublicCollections, PublicOrgDetails, CollAccessType, - PageUrlCount, - PageIdTimestamp, - PaginatedPageUrlCountResponse, + PageOut, UpdateCollHomeUrl, User, ImageFile, ImageFilePreparer, MIN_UPLOAD_PART_SIZE, PublicCollOut, - PreloadResource, ) from .utils import dt_now, slug_from_name, get_duplicate_key_error_field, get_origin @@ -347,21 +342,24 @@ async def get_collection_out( result = await self.get_collection_raw(coll_id, public_or_unlisted_only) if resources: - result["resources"], result["preloadResources"] = ( - await self.get_collection_crawl_resources( - coll_id, include_preloads=True - ) + result["resources"], crawl_ids, pages_optimized = ( + await self.get_collection_crawl_resources(coll_id) ) - result["initialPages"], result["totalPages"] = ( - await self.page_ops.list_collection_pages(coll_id, page_size=25) + initial_pages: List[PageOut] = await self.page_ops.list_replay_query_pages( + coll_id, + crawl_ids=crawl_ids, + page_size=25, ) public = "public/" if public_or_unlisted_only else "" - result["pagesQueryUrl"] = ( - get_origin(headers) - + f"/api/orgs/{org.id}/collections/{coll_id}/{public}pages" - ) + + if pages_optimized: + result["initialPages"] = initial_pages + result["pagesQueryUrl"] = ( + get_origin(headers) + + f"/api/orgs/{org.id}/collections/{coll_id}/{public}pages" + ) thumbnail = result.get("thumbnail") if thumbnail: @@ -388,7 +386,7 @@ async def get_public_collection_out( if result.get("access") not in allowed_access: raise HTTPException(status_code=404, detail="collection_not_found") - result["resources"], _ = await self.get_collection_crawl_resources(coll_id) + result["resources"], _, _ = await self.get_collection_crawl_resources(coll_id) thumbnail = result.get("thumbnail") if thumbnail: @@ -487,12 +485,6 @@ async def list_collections( collections: List[Union[CollOut, PublicCollOut]] = [] for res in items: - res["resources"], res["preloadResources"] = ( - await self.get_collection_crawl_resources( - res["_id"], include_preloads=not public_colls_out - ) - ) - thumbnail = res.get("thumbnail") if thumbnail: image_file = ImageFile(**thumbnail) @@ -514,13 +506,14 @@ async def list_collections( return collections, total async def get_collection_crawl_resources( - self, coll_id: UUID, include_preloads=False - ): + self, coll_id: UUID + ) -> tuple[List[CrawlFileOut], List[str], bool]: """Return pre-signed resources for all collection crawl files.""" # Ensure collection exists _ = await self.get_collection_raw(coll_id) resources = [] + pages_optimized = True crawls, _ = await self.crawl_ops.list_all_base_crawls( collection_id=coll_id, @@ -529,38 +522,16 @@ async def get_collection_crawl_resources( cls_type=CrawlOutWithResources, ) + crawl_ids = [] + for crawl in crawls: + crawl_ids.append(crawl.id) if crawl.resources: resources.extend(crawl.resources) + if crawl.version != 2: + pages_optimized = False - preload_resources: List[PreloadResource] = [] - - if include_preloads: - 
no_page_items = await self.get_collection_resources_with_no_pages(crawls) - for item in no_page_items: - preload_resources.append(item) - - return resources, preload_resources - - async def get_collection_resources_with_no_pages( - self, crawls: List[CrawlOutWithResources] - ) -> List[PreloadResource]: - """Return wacz files in collection that have no pages""" - resources_no_pages: List[PreloadResource] = [] - - for crawl in crawls: - _, page_count = await self.page_ops.list_pages(crawl.id) - if page_count == 0 and crawl.resources: - for resource in crawl.resources: - resources_no_pages.append( - PreloadResource( - name=os.path.basename(resource.name), - crawlId=crawl.id, - hasPages=False, - ) - ) - - return resources_no_pages + return resources, crawl_ids, pages_optimized async def get_collection_names(self, uuids: List[UUID]): """return object of {_id, names} given list of collection ids""" @@ -631,14 +602,11 @@ async def download_collection(self, coll_id: UUID, org: Organization): resp, headers=headers, media_type="application/wacz+zip" ) - async def recalculate_org_collection_counts_tags(self, org: Organization): - """Recalculate counts and tags for collections in org""" - collections, _ = await self.list_collections( - org, - page_size=100_000, - ) - for coll in collections: - await self.update_collection_counts_and_tags(coll.id) + async def recalculate_org_collection_stats(self, org: Organization): + """recalculate counts, tags and dates for all collections in an org""" + async for coll in self.collections.find({"oid": org.id}, projection={"_id": 1}): + await self.update_collection_counts_and_tags(coll.get("_id")) + await self.update_collection_dates(coll.get("_id")) async def update_collection_counts_and_tags(self, collection_id: UUID): """Set current crawl info in config when crawl begins""" @@ -649,9 +617,7 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): tags = [] crawl_ids = [] - - coll = await self.get_collection(collection_id) - org = await self.orgs.get_org_by_id(coll.oid) + preload_resources = [] async for crawl_raw in self.crawls.find({"collectionIds": collection_id}): crawl = BaseCrawl.from_dict(crawl_raw) @@ -663,10 +629,20 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): total_size += file.size try: - _, crawl_page_count = await self.page_ops.list_pages( - crawl.id, org, page_size=1_000_000 + crawl_page_count = await self.pages.count_documents( + {"crawl_id": crawl.id} ) - page_count += crawl_page_count + + if crawl_page_count == 0: + for file in files: + preload_resources.append( + { + "name": os.path.basename(file.filename), + "crawlId": crawl.id, + } + ) + else: + page_count += crawl_page_count # pylint: disable=broad-exception-caught except Exception: pass @@ -689,19 +665,11 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): "uniquePageCount": unique_page_count, "totalSize": total_size, "tags": sorted_tags, + "preloadResources": preload_resources, } }, ) - async def recalculate_org_collection_dates(self, org: Organization): - """Recalculate earliest and latest dates for collections in org""" - collections, _ = await self.list_collections( - org, - page_size=100_000, - ) - for coll in collections: - await self.update_collection_dates(coll.id) - async def update_collection_dates(self, coll_id: UUID): """Update collection earliest and latest dates from page timestamps""" # pylint: disable=too-many-locals @@ -805,81 +773,6 @@ async def get_org_public_collections( return 
OrgPublicCollections(org=public_org_details, collections=collections) - async def list_urls_in_collection( - self, - coll_id: UUID, - oid: UUID, - url_prefix: Optional[str] = None, - page_size: int = DEFAULT_PAGE_SIZE, - page: int = 1, - ) -> Tuple[List[PageUrlCount], int]: - """List all URLs in collection sorted desc by snapshot count unless prefix is specified""" - # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements - # Zero-index page for query - page = page - 1 - skip = page_size * page - - crawl_ids = await self.get_collection_crawl_ids(coll_id) - - match_query: dict[str, object] = {"oid": oid, "crawl_id": {"$in": crawl_ids}} - sort_query: dict[str, int] = {"count": -1, "_id": 1} - - if url_prefix: - url_prefix = urllib.parse.unquote(url_prefix) - regex_pattern = f"^{re.escape(url_prefix)}" - match_query["url"] = {"$regex": regex_pattern, "$options": "i"} - sort_query = {"_id": 1} - - aggregate: List[Dict[str, Union[int, object]]] = [{"$match": match_query}] - - aggregate.extend( - [ - { - "$group": { - "_id": "$url", - "pages": {"$push": "$$ROOT"}, - "count": {"$sum": 1}, - }, - }, - {"$sort": sort_query}, - {"$set": {"url": "$_id"}}, - { - "$facet": { - "items": [ - {"$skip": skip}, - {"$limit": page_size}, - ], - "total": [{"$count": "count"}], - } - }, - ] - ) - - # Get total - cursor = self.pages.aggregate(aggregate) - results = await cursor.to_list(length=1) - result = results[0] - items = result["items"] - - try: - total = int(result["total"][0]["count"]) - except (IndexError, ValueError): - total = 0 - - return [ - PageUrlCount( - url=data.get("url", ""), - count=data.get("count", 0), - snapshots=[ - PageIdTimestamp( - pageId=p["_id"], ts=p.get("ts"), status=p.get("status", 200) - ) - for p in data.get("pages", []) - ], - ) - for data in items - ], total - async def set_home_url( self, coll_id: UUID, update: UpdateCollHomeUrl, org: Organization ) -> Dict[str, bool]: @@ -1011,11 +904,13 @@ async def delete_thumbnail(self, coll_id: UUID, org: Organization): # ============================================================================ # pylint: disable=too-many-locals -def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_dep): +def init_collections_api( + app, mdb, orgs, storage_ops, event_webhook_ops, user_dep +) -> CollectionOps: """init collections api""" # pylint: disable=invalid-name, unused-argument, too-many-arguments - colls = CollectionOps(mdb, storage_ops, orgs, event_webhook_ops) + colls: CollectionOps = CollectionOps(mdb, storage_ops, orgs, event_webhook_ops) org_crawl_dep = orgs.org_crawl_dep org_viewer_dep = orgs.org_viewer_dep @@ -1068,7 +963,7 @@ async def get_collection_all(org: Organization = Depends(org_viewer_dep)): try: all_collections, _ = await colls.list_collections(org, page_size=10_000) for collection in all_collections: - results[collection.name], _ = ( + results[collection.name], _, _ = ( await colls.get_collection_crawl_resources(collection.id) ) except Exception as exc: @@ -1268,28 +1163,6 @@ async def download_public_collection( return await colls.download_collection(coll.id, org) - @app.get( - "/orgs/{oid}/collections/{coll_id}/urls", - tags=["collections"], - response_model=PaginatedPageUrlCountResponse, - ) - async def get_collection_url_list( - coll_id: UUID, - oid: UUID, - urlPrefix: Optional[str] = None, - pageSize: int = DEFAULT_PAGE_SIZE, - page: int = 1, - ): - """Retrieve paginated list of urls in collection sorted by snapshot count""" - pages, total = await 
colls.list_urls_in_collection( - coll_id=coll_id, - oid=oid, - url_prefix=urlPrefix, - page_size=pageSize, - page=page, - ) - return paginated_format(pages, total, page, pageSize) - @app.post( "/orgs/{oid}/collections/{coll_id}/home-url", tags=["collections"], diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 1ea58e00bf..353f87bd70 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -21,6 +21,7 @@ # ============================================================================ +# pylint: disable=too-many-public-methods class CrawlManager(K8sAPI): """abstract crawl manager""" @@ -128,7 +129,7 @@ async def run_delete_org_job( job_id = f"delete-org-{oid}-{secrets.token_hex(5)}" return await self._run_bg_job_with_ops_classes( - oid, job_id, job_type=BgJobType.DELETE_ORG.value + job_id, job_type=BgJobType.DELETE_ORG.value, oid=oid ) async def run_recalculate_org_stats_job( @@ -144,15 +145,14 @@ async def run_recalculate_org_stats_job( job_id = f"org-stats-{oid}-{secrets.token_hex(5)}" return await self._run_bg_job_with_ops_classes( - oid, - job_id, - job_type=BgJobType.RECALCULATE_ORG_STATS.value, + job_id, job_type=BgJobType.RECALCULATE_ORG_STATS.value, oid=oid ) async def run_re_add_org_pages_job( self, oid: str, crawl_type: Optional[str] = None, + crawl_id: Optional[str] = None, existing_job_id: Optional[str] = None, ) -> str: """run job to recalculate storage stats for the org""" @@ -163,25 +163,45 @@ async def run_re_add_org_pages_job( job_id = f"org-pages-{oid}-{secrets.token_hex(5)}" return await self._run_bg_job_with_ops_classes( - oid, job_id, job_type=BgJobType.READD_ORG_PAGES.value, + oid=oid, crawl_type=crawl_type, + crawl_id=crawl_id, + ) + + async def run_optimize_pages_job( + self, existing_job_id: Optional[str] = None, scale=1 + ) -> str: + """run job to optimize crawl pages""" + + if existing_job_id: + job_id = existing_job_id + else: + job_id = f"optimize-pages-{secrets.token_hex(5)}" + + return await self._run_bg_job_with_ops_classes( + job_id, job_type=BgJobType.OPTIMIZE_PAGES.value, scale=scale ) async def _run_bg_job_with_ops_classes( - self, oid: str, job_id: str, job_type: str, **kwargs + self, + job_id: str, + job_type: str, + oid: Optional[str] = None, + **kwargs, ) -> str: """run background job with access to ops classes""" params = { "id": job_id, - "oid": oid, "job_type": job_type, "backend_image": os.environ.get("BACKEND_IMAGE", ""), "pull_policy": os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""), **kwargs, } + if oid: + params["oid"] = oid data = self.templates.env.get_template("background_job.yaml").render(params) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 2a89e04fa5..01c9cc4150 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -382,6 +382,7 @@ async def add_new_crawl( crawlerChannel=crawlconfig.crawlerChannel, proxyId=crawlconfig.proxyId, image=image, + version=2, ) try: diff --git a/backend/btrixcloud/emailsender.py b/backend/btrixcloud/emailsender.py index e710f99ce9..7651e8dc3f 100644 --- a/backend/btrixcloud/emailsender.py +++ b/backend/btrixcloud/emailsender.py @@ -154,9 +154,9 @@ def send_user_forgot_password(self, receiver_email, token, headers=None): def send_background_job_failed( self, job: Union[CreateReplicaJob, DeleteReplicaJob], - org: Organization, finished: datetime, receiver_email: str, + org: Optional[Organization] = None, ): """Send background job failed email to superuser""" self._send_encrypted( diff 
--git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py index 024f61b45d..ff80fb4bef 100644 --- a/backend/btrixcloud/main_bg.py +++ b/backend/btrixcloud/main_bg.py @@ -13,10 +13,12 @@ job_type = os.environ.get("BG_JOB_TYPE") oid = os.environ.get("OID") crawl_type = os.environ.get("CRAWL_TYPE") +crawl_id = os.environ.get("CRAWL_ID") # ============================================================================ # pylint: disable=too-many-function-args, duplicate-code, too-many-locals, too-many-return-statements +# pylint: disable=too-many-branches async def main(): """run background job with access to ops classes""" @@ -30,6 +32,17 @@ async def main(): (org_ops, _, _, _, _, page_ops, coll_ops, _, _, _, _, user_manager) = init_ops() + # Run job (generic) + if job_type == BgJobType.OPTIMIZE_PAGES: + try: + await page_ops.optimize_crawl_pages(version=2) + return 0 + # pylint: disable=broad-exception-caught + except Exception: + traceback.print_exc() + return 1 + + # Run job (org-specific) if not oid: print("Org id missing, quitting") return 1 @@ -39,7 +52,6 @@ async def main(): print("Org id invalid, quitting") return 1 - # Run job if job_type == BgJobType.DELETE_ORG: try: await org_ops.delete_org_and_data(org, user_manager) @@ -60,9 +72,12 @@ async def main(): if job_type == BgJobType.READD_ORG_PAGES: try: - await page_ops.re_add_all_crawl_pages(org, crawl_type=crawl_type) - await coll_ops.recalculate_org_collection_dates(org) - await coll_ops.recalculate_org_collection_counts_tags(org) + if not crawl_id: + await page_ops.re_add_all_crawl_pages(org, crawl_type=crawl_type) + else: + await page_ops.re_add_crawl_pages(crawl_id=crawl_id, oid=org.id) + + await coll_ops.recalculate_org_collection_stats(org) return 0 # pylint: disable=broad-exception-caught except Exception: diff --git a/backend/btrixcloud/migrations/migration_0037_upload_pages.py b/backend/btrixcloud/migrations/migration_0037_upload_pages.py index 1c7e4a80a3..8f896c5c92 100644 --- a/backend/btrixcloud/migrations/migration_0037_upload_pages.py +++ b/backend/btrixcloud/migrations/migration_0037_upload_pages.py @@ -59,8 +59,7 @@ async def migrate_up(self): async for org_dict in mdb_orgs.find({}): org = Organization.from_dict(org_dict) try: - await self.coll_ops.recalculate_org_collection_dates(org) - await self.coll_ops.recalculate_org_collection_counts_tags(org) + await self.coll_ops.recalculate_org_collection_stats(org) # pylint: disable=broad-exception-caught except Exception as err: print( diff --git a/backend/btrixcloud/migrations/migration_0042_page_filenames.py b/backend/btrixcloud/migrations/migration_0042_page_filenames.py index 5410d4b593..3a5b5723cc 100644 --- a/backend/btrixcloud/migrations/migration_0042_page_filenames.py +++ b/backend/btrixcloud/migrations/migration_0042_page_filenames.py @@ -15,36 +15,26 @@ class Migration(BaseMigration): def __init__(self, mdb, **kwargs): super().__init__(mdb, migration_version=MIGRATION_VERSION) - self.page_ops = kwargs.get("page_ops") + self.background_job_ops = kwargs.get("background_job_ops") async def migrate_up(self): """Perform migration up. - Add filename to all pages that don't currently have it stored, - iterating through each archived item and its WACZ files as necessary + Optimize crawl pages for optimized replay in background job by adding + filename, isSeed, depth, and favIconUrl as needed. 
""" - pages_mdb = self.mdb["pages"] - - if self.page_ops is None: + if self.background_job_ops is None: print( - "Unable to add filename and other fields to pages, missing page_ops", + "Unable to start background job to optimize pages, ops class missing", flush=True, ) return - crawl_ids_to_update = await pages_mdb.distinct("crawl_id", {"filename": None}) - - crawl_count = len(crawl_ids_to_update) - current_index = 1 - - for crawl_id in crawl_ids_to_update: - print(f"Migrating archived item {current_index}/{crawl_count}", flush=True) - try: - await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id) - # pylint: disable=broad-exception-caught - except Exception as err: - print( - f"Error adding filename and other fields to pages in item {crawl_id}: {err}", - flush=True, - ) - current_index += 1 + try: + await self.background_job_ops.create_optimize_crawl_pages_job() + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Unable to start background job to optimize pages: {err}", + flush=True, + ) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 058b7927fc..e8d7b20ade 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -805,6 +805,9 @@ class BaseCrawl(CoreCrawlable, BaseMongoModel): filePageCount: Optional[int] = 0 errorPageCount: Optional[int] = 0 + isMigrating: Optional[bool] = None + version: Optional[int] = None + # ============================================================================ class CollIdName(BaseModel): @@ -882,6 +885,10 @@ class CrawlOut(BaseMongoModel): filePageCount: Optional[int] = 0 errorPageCount: Optional[int] = 0 + # Set to older version by default, crawls with optimized + # pages will have this explicitly set to 2 + version: Optional[int] = 1 + # ============================================================================ class UpdateCrawl(BaseModel): @@ -1378,7 +1385,6 @@ class CrawlOutWithResources(CrawlOut): collections: Optional[List[CollIdName]] = [] initialPages: List[PageOut] = [] - totalPages: Optional[int] = None pagesQueryUrl: str = "" @@ -1411,7 +1417,6 @@ class PreloadResource(BaseModel): name: str crawlId: str - hasPages: bool # ============================================================================ @@ -1508,7 +1513,6 @@ class CollOut(BaseMongoModel): allowPublicDownload: bool = True initialPages: List[PageOut] = [] - totalPages: Optional[int] = None preloadResources: List[PreloadResource] = [] pagesQueryUrl: str = "" @@ -2110,9 +2114,6 @@ class OrgMetrics(BaseModel): pageCount: int crawlPageCount: int uploadPageCount: int - uniquePageCount: int - crawlUniquePageCount: int - uploadUniquePageCount: int profileCount: int workflowsRunningCount: int maxConcurrentCrawls: int @@ -2533,6 +2534,7 @@ class BgJobType(str, Enum): DELETE_ORG = "delete-org" RECALCULATE_ORG_STATS = "recalculate-org-stats" READD_ORG_PAGES = "readd-org-pages" + OPTIMIZE_PAGES = "optimize-pages" # ============================================================================ @@ -2541,7 +2543,7 @@ class BackgroundJob(BaseMongoModel): id: str type: BgJobType - oid: UUID + oid: Optional[UUID] = None success: Optional[bool] = None started: datetime finished: Optional[datetime] = None @@ -2588,10 +2590,18 @@ class RecalculateOrgStatsJob(BackgroundJob): # ============================================================================ class ReAddOrgPagesJob(BackgroundJob): - """Model for tracking jobs to readd an org's pages""" + """Model for tracking jobs to readd pages for an org or single crawl""" 
type: Literal[BgJobType.READD_ORG_PAGES] = BgJobType.READD_ORG_PAGES crawl_type: Optional[str] = None + crawl_id: Optional[str] = None + + +# ============================================================================ +class OptimizePagesJob(BackgroundJob): + """Model for tracking jobs to optimize pages across all orgs""" + + type: Literal[BgJobType.OPTIMIZE_PAGES] = BgJobType.OPTIMIZE_PAGES # ============================================================================ @@ -2605,6 +2615,7 @@ class ReAddOrgPagesJob(BackgroundJob): DeleteOrgJob, RecalculateOrgStatsJob, ReAddOrgPagesJob, + OptimizePagesJob, ] ] @@ -2808,6 +2819,13 @@ class PaginatedPageOutResponse(PaginatedResponse): items: List[PageOut] +# ============================================================================ +class PageOutItemsResponse(BaseModel): + """Response model for pages without total""" + + items: List[PageOut] + + # ============================================================================ class PaginatedPageOutWithQAResponse(PaginatedResponse): """Response model for paginated pages with single QA info""" @@ -2858,7 +2876,7 @@ class PaginatedUserEmailsResponse(PaginatedResponse): # ============================================================================ -class PaginatedPageUrlCountResponse(PaginatedResponse): +class PageUrlCountResponse(BaseModel): """Response model for page count by url""" items: List[PageUrlCount] diff --git a/backend/btrixcloud/operator/bgjobs.py b/backend/btrixcloud/operator/bgjobs.py index 4538582c08..dc7cb63d77 100644 --- a/backend/btrixcloud/operator/bgjobs.py +++ b/backend/btrixcloud/operator/bgjobs.py @@ -49,9 +49,15 @@ async def finalize_background_job(self, data: MCDecoratorSyncData) -> dict: if not finished: finished = dt_now() + try: + org_id = UUID(oid) + # pylint: disable=broad-except + except Exception: + org_id = None + try: await self.background_job_ops.job_finished( - job_id, job_type, UUID(oid), success=success, finished=finished + job_id, job_type, success=success, finished=finished, oid=org_id ) # print( # f"{job_type} background job completed: success: {success}, {job_id}", diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 7c9f7558c0..ad915e9734 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -947,9 +947,6 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]: crawl_page_count = 0 upload_page_count = 0 - crawl_ids = [] - upload_ids = [] - async for item_data in self.crawls_db.find({"oid": org.id}): item = BaseCrawl.from_dict(item_data) if item.state not in SUCCESSFUL_STATES: @@ -958,22 +955,12 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]: if item.type == "crawl": crawl_count += 1 crawl_page_count += item.pageCount or 0 - crawl_ids.append(item.id) if item.type == "upload": upload_count += 1 upload_page_count += item.pageCount or 0 - upload_ids.append(item.id) if item.pageCount: page_count += item.pageCount - all_archived_item_ids = crawl_ids + upload_ids - - unique_page_count = await self.page_ops.get_unique_page_count( - all_archived_item_ids - ) - crawl_unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids) - upload_unique_page_count = await self.page_ops.get_unique_page_count(upload_ids) - profile_count = await self.profiles_db.count_documents({"oid": org.id}) workflows_running_count = await self.crawls_db.count_documents( {"oid": org.id, "state": {"$in": RUNNING_STATES}} @@ -998,9 +985,6 @@ async def get_org_metrics(self, org: Organization) -> 
dict[str, int]: "pageCount": page_count, "crawlPageCount": crawl_page_count, "uploadPageCount": upload_page_count, - "uniquePageCount": unique_page_count, - "crawlUniquePageCount": crawl_unique_page_count, - "uploadUniquePageCount": upload_unique_page_count, "profileCount": profile_count, "workflowsRunningCount": workflows_running_count, "maxConcurrentCrawls": max_concurrent_crawls, diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index b1ad1ee6d1..fc293508d4 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -3,7 +3,6 @@ # pylint: disable=too-many-lines import asyncio -import os import re import traceback import urllib.parse @@ -20,8 +19,12 @@ PageOutWithSingleQA, PageReviewUpdate, PageQACompare, + PageIdTimestamp, + PageUrlCount, + PageUrlCountResponse, Organization, PaginatedPageOutResponse, + PageOutItemsResponse, PaginatedPageOutWithQAResponse, User, PageNote, @@ -30,7 +33,6 @@ PageNoteDelete, QARunBucketStats, StartedResponse, - StartedResponseBool, UpdatedResponse, DeletedResponse, PageNoteAddedResponse, @@ -76,18 +78,34 @@ def __init__( async def init_index(self): """init index for pages db collection""" await self.pages.create_index([("crawl_id", pymongo.HASHED)]) + await self.pages.create_index( + [ + ("crawl_id", pymongo.HASHED), + ("isSeed", pymongo.DESCENDING), + ("ts", pymongo.ASCENDING), + ] + ) + await self.pages.create_index( + [ + ("crawl_id", pymongo.HASHED), + ("url", pymongo.ASCENDING), + ] + ) + await self.pages.create_index([("title", "text")]) async def set_ops(self, background_job_ops: BackgroundJobOps): """Set ops classes as needed""" self.background_job_ops = background_job_ops - async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100): + async def add_crawl_pages_to_db_from_wacz( + self, crawl_id: str, batch_size=100, num_retries=5 + ): """Add pages to database from WACZ files""" pages_buffer: List[Page] = [] + crawl = await self.crawl_ops.get_crawl_out(crawl_id) try: - crawl = await self.crawl_ops.get_crawl_out(crawl_id) stream = await self.storage_ops.sync_stream_wacz_pages( - crawl.resources or [] + crawl.resources or [], num_retries ) new_uuid = crawl.type == "upload" seed_count = 0 @@ -104,7 +122,13 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100): non_seed_count += 1 if len(pages_buffer) > batch_size: - await self._add_pages_to_db(crawl_id, pages_buffer) + try: + await self._add_pages_to_db( + crawl_id, pages_buffer, ordered=False + ) + # pylint: disable=broad-exception-caught + except Exception as e: + print("Error inserting, probably dupe", e) pages_buffer = [] pages_buffer.append( @@ -113,65 +137,24 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100): # Add any remaining pages in buffer to db if pages_buffer: - await self._add_pages_to_db(crawl_id, pages_buffer) + try: + await self._add_pages_to_db(crawl_id, pages_buffer, ordered=False) + # pylint: disable=broad-exception-caught + except Exception as e: + print("Error inserting, probably dupe", e) await self.set_archived_item_page_counts(crawl_id) print( - f"Added pages for crawl {crawl_id}: {seed_count} Seed, {non_seed_count} Non-Seed", + f"Added pages for crawl {crawl_id}: " + + f"{seed_count} Seed, {non_seed_count} Non-Seed", flush=True, ) - # pylint: disable=broad-exception-caught, raise-missing-from - except Exception as err: - traceback.print_exc() - print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True) - - async def 
add_crawl_wacz_filename_to_pages(self, crawl_id: str): - """Add WACZ filename and additional fields to existing pages in crawl if not already set""" - try: - crawl = await self.crawl_ops.get_crawl_out(crawl_id) - if not crawl.resources: - return - - for wacz_file in crawl.resources: - # Strip oid directory from filename - filename = os.path.basename(wacz_file.name) - - stream = await self.storage_ops.sync_stream_wacz_pages([wacz_file]) - for page_dict in stream: - if not page_dict.get("url"): - continue - page_id = page_dict.get("id") - - if not page_id: - continue - - if page_id: - try: - page_id = UUID(page_id) - # pylint: disable=broad-exception-caught - except Exception: - continue - - await self.pages.find_one_and_update( - {"_id": page_id}, - { - "$set": { - "filename": filename, - "depth": page_dict.get("depth"), - "isSeed": page_dict.get("seed", False), - "favIconUrl": page_dict.get("favIconUrl"), - } - }, - ) # pylint: disable=broad-exception-caught, raise-missing-from except Exception as err: traceback.print_exc() - print( - f"Error adding filename to pages from item {crawl_id} to db: {err}", - flush=True, - ) + print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True) def _get_page_from_dict( self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID, new_uuid: bool @@ -207,7 +190,7 @@ def _get_page_from_dict( p.compute_page_type() return p - async def _add_pages_to_db(self, crawl_id: str, pages: List[Page]): + async def _add_pages_to_db(self, crawl_id: str, pages: List[Page], ordered=True): """Add batch of pages to db in one insert""" result = await self.pages.insert_many( [ @@ -215,7 +198,8 @@ async def _add_pages_to_db(self, crawl_id: str, pages: List[Page]): exclude_unset=True, exclude_none=True, exclude_defaults=True ) for page in pages - ] + ], + ordered=ordered, ) if not result.inserted_ids: # pylint: disable=broad-exception-raised @@ -675,9 +659,65 @@ async def list_pages( return [PageOut.from_dict(data) for data in items], total - async def list_collection_pages( + async def list_page_url_counts( self, coll_id: UUID, + url_prefix: Optional[str] = None, + page_size: int = DEFAULT_PAGE_SIZE, + ) -> List[PageUrlCount]: + """List all page URLs in collection sorted desc by snapshot count + unless prefix is specified""" + # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements + # Zero-index page for query + + crawl_ids = await self.coll_ops.get_collection_crawl_ids(coll_id) + + match_query: dict[str, object] = {"crawl_id": {"$in": crawl_ids}} + sort_query: dict[str, int] = {"isSeed": -1, "ts": 1} + + if url_prefix: + url_prefix = urllib.parse.unquote(url_prefix) + # regex_pattern = f"^{re.escape(url_prefix)}" + # match_query["url"] = {"$regex": regex_pattern, "$options": "i"} + match_query["url"] = {"$gte": url_prefix} + sort_query = {"url": 1} + + aggregate: List[Dict[str, Union[int, object]]] = [ + {"$match": match_query}, + {"$sort": sort_query}, + ] + + aggregate.append({"$limit": page_size * len(crawl_ids)}) + + cursor = self.pages.aggregate(aggregate) + results = await cursor.to_list(length=page_size * len(crawl_ids)) + + url_counts: dict[str, PageUrlCount] = {} + + for result in results: + url = result.get("url") + count = url_counts.get(url) + if not count: + # if already at max pages, this would add a new page, so we're done + if len(url_counts) >= page_size: + break + count = PageUrlCount(url=url, snapshots=[], count=0) + url_counts[url] = count + count.snapshots.append( + PageIdTimestamp( + 
pageId=result.get("_id"), + ts=result.get("ts"), + status=result.get("status", 200), + ) + ) + count.count += 1 + + return list(url_counts.values()) + + async def list_replay_query_pages( + self, + coll_id: Optional[UUID] = None, + crawl_ids: Optional[List[str]] = None, org: Optional[Organization] = None, search: Optional[str] = None, url: Optional[str] = None, @@ -690,16 +730,22 @@ async def list_collection_pages( sort_by: Optional[str] = None, sort_direction: Optional[int] = -1, public_or_unlisted_only=False, - ) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]: - """List all pages in collection, with optional filtering""" + ) -> List[PageOut]: + """Query pages in collection, with filtering sorting. No total returned for optimization""" # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements # Zero-index page for query page = page - 1 skip = page_size * page - crawl_ids = await self.coll_ops.get_collection_crawl_ids( - coll_id, public_or_unlisted_only - ) + if crawl_ids is None and coll_id is None: + raise HTTPException( + status_code=400, detail="either crawl_ids or coll_id must be provided" + ) + + if coll_id and crawl_ids is None: + crawl_ids = await self.coll_ops.get_collection_crawl_ids( + coll_id, public_or_unlisted_only + ) query: dict[str, object] = { "crawl_id": {"$in": crawl_ids}, @@ -707,12 +753,14 @@ async def list_collection_pages( if org: query["oid"] = org.id + is_text_search = False if search: - search_regex = re.escape(urllib.parse.unquote(search)) - query["$or"] = [ - {"url": {"$regex": search_regex, "$options": "i"}}, - {"title": {"$regex": search_regex, "$options": "i"}}, - ] + search = urllib.parse.unquote(search) + if search.startswith("http:") or search.startswith("https:"): + query["url"] = {"$gte": search} + else: + query["$text"] = {"$search": search} + is_text_search = True elif url_prefix: url_prefix = urllib.parse.unquote(url_prefix) @@ -731,7 +779,7 @@ async def list_collection_pages( if isinstance(depth, int): query["depth"] = depth - aggregate = [{"$match": query}] + aggregate: list[dict[str, object]] = [{"$match": query}] if sort_by: # Sorting options to add: @@ -753,38 +801,30 @@ async def list_collection_pages( raise HTTPException(status_code=400, detail="invalid_sort_direction") aggregate.extend([{"$sort": {sort_by: sort_direction}}]) + elif search: + if is_text_search: + aggregate.extend( + [ + {"$sort": {"score": {"$meta": "textScore"}}}, + ] + ) + else: + aggregate.extend([{"$sort": {"url": 1}}]) else: # default sort: seeds first, then by timestamp aggregate.extend([{"$sort": {"isSeed": -1, "ts": 1}}]) - aggregate.extend( - [ - { - "$facet": { - "items": [ - {"$skip": skip}, - {"$limit": page_size}, - ], - "total": [{"$count": "count"}], - } - }, - ] - ) + if skip: + aggregate.append({"$skip": skip}) + aggregate.append({"$limit": page_size}) - # Get total cursor = self.pages.aggregate(aggregate) - results = await cursor.to_list(length=1) - result = results[0] - items = result["items"] - try: - total = int(result["total"][0]["count"]) - except (IndexError, ValueError): - total = 0 + results = await cursor.to_list(length=page_size) - return [PageOut.from_dict(data) for data in items], total + return [PageOut.from_dict(data) for data in results] - async def re_add_crawl_pages(self, crawl_id: str, oid: UUID): + async def re_add_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None): """Delete existing pages for crawl and re-add from WACZs.""" try: @@ -954,10 +994,15 @@ def 
get_crawl_type_from_pages_route(self, request: Request): async def get_unique_page_count(self, crawl_ids: List[str]) -> int: """Get count of unique page URLs across list of archived items""" - unique_pages = await self.pages.distinct( - "url", {"crawl_id": {"$in": crawl_ids}} + cursor = self.pages.aggregate( + [ + {"$match": {"crawl_id": {"$in": crawl_ids}}}, + {"$group": {"_id": "$url"}}, + {"$count": "urls"}, + ] ) - return len(unique_pages) or 0 + res = await cursor.to_list(1) + return res[0].get("urls") if res else 0 async def set_archived_item_page_counts(self, crawl_id: str): """Store archived item page and unique page counts in crawl document""" @@ -970,12 +1015,82 @@ async def set_archived_item_page_counts(self, crawl_id: str): {"$set": {"uniquePageCount": unique_page_count, "pageCount": page_count}}, ) + async def optimize_crawl_pages(self, version: int = 2): + """Iterate through crawls, optimizing pages""" + + async def process_finished_crawls(): + while True: + # Pull new finished crawl and set isMigrating + match_query = { + "version": {"$ne": version}, + "isMigrating": {"$ne": True}, + "finished": {"$ne": None}, + } + + next_crawl = await self.crawls.find_one_and_update( + match_query, + {"$set": {"isMigrating": True}}, + sort=[("finished", -1)], + ) + if next_crawl is None: + print("No more finished crawls to migrate") + break + + crawl_id = next_crawl.get("_id") + print("Processing crawl: " + crawl_id) + + # Re-add crawl pages if at least one page doesn't have filename set + has_page_no_filename = await self.pages.find_one( + {"crawl_id": crawl_id, "filename": None} + ) + if has_page_no_filename: + print("Re-importing pages to migrate to v2") + await self.re_add_crawl_pages(crawl_id) + else: + print("Pages already have filename, set to v2") + + # Update crawl version and unset isMigrating + await self.crawls.find_one_and_update( + {"_id": crawl_id}, + {"$set": {"version": version, "isMigrating": False}}, + ) + + await process_finished_crawls() + + # Wait for running crawls from before migration to finish, and then process + # again when they're done to make sure everything's been handled + while True: + match_query = { + "version": {"$ne": version}, + "finished": None, + } + running_crawl = await self.crawls.find_one(match_query) + + if not running_crawl: + print("No running crawls remain") + break + + print("Running crawls remain, waiting for them to finish") + await asyncio.sleep(30) + + await process_finished_crawls() + + # Wait until all pods are fully done before returning. For k8s job + # parallelism to work as expected, pods must only return exit code 0 + # once the work in all pods is fully complete. 
+ while True: + in_progress = await self.crawls.find_one({"isMigrating": True}) + if in_progress is None: + break + print("Unmigrated crawls remain, finishing job") + await asyncio.sleep(5) + # ============================================================================ # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme def init_pages_api( app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep -): +) -> PageOps: """init pages API""" # pylint: disable=invalid-name @@ -1018,25 +1133,27 @@ async def re_add_all_crawl_pages( @app.post( "/orgs/{oid}/crawls/{crawl_id}/pages/reAdd", tags=["pages", "crawls"], - response_model=StartedResponseBool, + response_model=StartedResponse, ) @app.post( "/orgs/{oid}/uploads/{crawl_id}/pages/reAdd", tags=["pages", "uploads"], - response_model=StartedResponseBool, + response_model=StartedResponse, ) @app.post( "/orgs/{oid}/all-crawls/{crawl_id}/pages/reAdd", tags=["pages", "all-crawls"], - response_model=StartedResponseBool, + response_model=StartedResponse, ) async def re_add_crawl_pages( crawl_id: str, org: Organization = Depends(org_crawl_dep), ): """Re-add pages for crawl (may delete page QA data!)""" - asyncio.create_task(ops.re_add_crawl_pages(crawl_id, org.id)) - return {"started": True} + job_id = await ops.background_job_ops.create_re_add_org_pages_job( + org.id, crawl_id=crawl_id + ) + return {"started": job_id or ""} @app.get( "/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}", @@ -1195,7 +1312,7 @@ async def get_crawl_pages_list( @app.get( "/orgs/{oid}/collections/{coll_id}/public/pages", tags=["pages", "collections"], - response_model=PaginatedPageOutResponse, + response_model=PageOutItemsResponse, ) async def get_public_collection_pages_list( coll_id: UUID, @@ -1213,7 +1330,7 @@ async def get_public_collection_pages_list( sortDirection: Optional[int] = -1, ): """Retrieve paginated list of pages in collection""" - pages, total = await ops.list_collection_pages( + pages = await ops.list_replay_query_pages( coll_id=coll_id, org=org, search=search, @@ -1231,7 +1348,7 @@ async def get_public_collection_pages_list( response.headers["Access-Control-Allow-Origin"] = "*" response.headers["Access-Control-Allow-Headers"] = "*" - return paginated_format(pages, total, page, pageSize) + return {"items": pages} @app.options( "/orgs/{oid}/collections/{coll_id}/pages", @@ -1252,7 +1369,7 @@ async def get_replay_preflight(response: Response): @app.get( "/orgs/{oid}/collections/{coll_id}/pages", tags=["pages", "collections"], - response_model=PaginatedPageOutResponse, + response_model=PageOutItemsResponse, ) async def get_collection_pages_list( coll_id: UUID, @@ -1270,7 +1387,7 @@ async def get_collection_pages_list( sortDirection: Optional[int] = -1, ): """Retrieve paginated list of pages in collection""" - pages, total = await ops.list_collection_pages( + pages = await ops.list_replay_query_pages( coll_id=coll_id, org=org, search=search, @@ -1286,7 +1403,7 @@ async def get_collection_pages_list( ) response.headers["Access-Control-Allow-Origin"] = "*" response.headers["Access-Control-Allow-Headers"] = "*" - return paginated_format(pages, total, page, pageSize) + return {"items": pages} @app.get( "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/pages", @@ -1334,4 +1451,24 @@ async def get_pages_list_with_qa( ) return paginated_format(pages, total, page, pageSize) + @app.get( + "/orgs/{oid}/collections/{coll_id}/pageUrlCounts", + tags=["collections"], + response_model=PageUrlCountResponse, + ) + async def 
get_collection_url_list( + coll_id: UUID, + # oid: UUID, + urlPrefix: Optional[str] = None, + pageSize: int = DEFAULT_PAGE_SIZE, + # page: int = 1, + ): + """Retrieve paginated list of urls in collection sorted by snapshot count""" + pages = await ops.list_page_url_counts( + coll_id=coll_id, + url_prefix=urlPrefix, + page_size=pageSize, + ) + return {"items": pages} + return ops diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 620c4b293c..768e255d7e 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -18,6 +18,7 @@ from itertools import chain import asyncio +import time import heapq import zlib import json @@ -495,12 +496,14 @@ async def _delete_file( return status_code == 204 async def sync_stream_wacz_pages( - self, wacz_files: List[CrawlFileOut] + self, wacz_files: List[CrawlFileOut], num_retries=5 ) -> Iterator[Dict[Any, Any]]: """Return stream of pages specified WACZ""" loop = asyncio.get_event_loop() - resp = await loop.run_in_executor(None, self._sync_get_pages, wacz_files) + resp = await loop.run_in_executor( + None, self._sync_get_pages, wacz_files, num_retries + ) return resp @@ -600,8 +603,7 @@ def organize_based_on_instance_number( return stream_json_lines(heap_iter, log_levels, contexts) def _sync_get_pages( - self, - wacz_files: List[CrawlFileOut], + self, wacz_files: List[CrawlFileOut], num_retries=5 ) -> Iterator[Dict[Any, Any]]: """Generate stream of page dicts from specified WACZs""" @@ -627,28 +629,44 @@ def stream_page_lines( page_json["seed"] = True yield page_json - page_generators: List[Iterator[Dict[Any, Any]]] = [] + count = 0 + total = len(wacz_files) for wacz_file in wacz_files: wacz_url = self.resolve_internal_access_path(wacz_file.path) - with RemoteZip(wacz_url) as remote_zip: - page_files: List[ZipInfo] = [ - f - for f in remote_zip.infolist() - if f.filename.startswith("pages/") - and f.filename.endswith(".jsonl") - and not f.is_dir() - ] - for pagefile_zipinfo in page_files: - page_generators.append( - stream_page_lines( - pagefile_zipinfo, - wacz_url, - wacz_file.name, - ) - ) - return chain.from_iterable(page_generators) + retry = 0 + count += 1 + + print(f" Processing {count} of {total} WACZ {wacz_url}") + + while True: + try: + with RemoteZip(wacz_url) as remote_zip: + page_files: List[ZipInfo] = [ + f + for f in remote_zip.infolist() + if f.filename.startswith("pages/") + and f.filename.endswith(".jsonl") + and not f.is_dir() + ] + for pagefile_zipinfo in page_files: + yield from stream_page_lines( + pagefile_zipinfo, + wacz_url, + wacz_file.name, + ) + except Exception as exc: + msg = str(exc) + if retry < num_retries: + retry += 1 + print(f"Retrying, {retry} of {num_retries}, {msg}") + time.sleep(30) + continue + + print(f"No more retries for error: {msg}, skipping {wacz_url}") + + break def _sync_get_filestream(self, wacz_url: str, filename: str) -> Iterator[bytes]: """Return iterator of lines in remote file as bytes""" diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index 3324efdd89..80771f3c17 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -178,6 +178,7 @@ async def _create_upload( fileSize=file_size, started=now, finished=now, + version=2, ) # result = await self.crawls.insert_one(uploaded.to_dict()) diff --git a/backend/btrixcloud/version.py b/backend/btrixcloud/version.py index fd88c44e5a..9644fe3fe2 100644 --- a/backend/btrixcloud/version.py +++ b/backend/btrixcloud/version.py @@ -1,3 +1,3 @@ """current version""" 
-__version__ = "1.14.0-beta.1" +__version__ = "1.14.0-beta.5" diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 3bace9eb08..e3621b0580 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -436,8 +436,7 @@ def test_collection_public(crawler_auth_headers, default_org_id): ) assert r.status_code == 200 data = r.json() - assert data["total"] > 0 - assert data["items"] + assert len(data["items"]) > 0 # make unlisted and test replay headers r = requests.patch( @@ -619,7 +618,7 @@ def test_list_pages_in_collection(crawler_auth_headers, default_org_id): assert r.status_code == 200 data = r.json() - assert data["total"] >= 0 + assert len(data["items"]) >= 0 pages = data["items"] assert pages @@ -644,18 +643,19 @@ def test_list_pages_in_collection(crawler_auth_headers, default_org_id): coll_page_ts = coll_page["ts"] coll_page_title = coll_page["title"] - # Test search filter - partial_title = coll_page_title[:5] + # Test search filter, make sure text search isn't case sensitive + partial_title = "Archiving" + partial_title_lower = partial_title.lower() partial_url = coll_page_url[:8] r = requests.get( - f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_title}", + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_title_lower}", headers=crawler_auth_headers, ) assert r.status_code == 200 data = r.json() - assert data["total"] >= 1 + assert len(data["items"]) >= 1 for matching_page in data["items"]: assert ( partial_title in matching_page["title"] @@ -669,7 +669,7 @@ def test_list_pages_in_collection(crawler_auth_headers, default_org_id): assert r.status_code == 200 data = r.json() - assert data["total"] >= 1 + assert len(data["items"]) >= 1 for matching_page in data["items"]: assert ( partial_title in matching_page["title"] @@ -684,7 +684,7 @@ def test_list_pages_in_collection(crawler_auth_headers, default_org_id): assert r.status_code == 200 data = r.json() - assert data["total"] >= 1 + assert len(data["items"]) >= 1 for matching_page in data["items"]: assert matching_page["url"] == coll_page_url @@ -696,7 +696,7 @@ def test_list_pages_in_collection(crawler_auth_headers, default_org_id): assert r.status_code == 200 data = r.json() - assert data["total"] >= 1 + assert len(data["items"]) >= 1 for matching_page in data["items"]: assert matching_page["url"] == coll_page_url assert matching_page["ts"] == coll_page_ts @@ -710,7 +710,7 @@ def test_list_pages_in_collection(crawler_auth_headers, default_org_id): assert r.status_code == 200 data = r.json() - assert data["total"] >= 1 + assert len(data["items"]) >= 1 found_matching_page = False for page in data["items"]: @@ -1182,13 +1182,13 @@ def test_set_collection_home_url( def test_collection_url_list(crawler_auth_headers, default_org_id): r = requests.get( - f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/urls", + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/pageUrlCounts", headers=crawler_auth_headers, ) assert r.status_code == 200 data = r.json() - assert data["total"] >= 1 + assert len(data["items"]) >= 1 urls = data["items"] assert urls @@ -1289,7 +1289,6 @@ def test_list_public_colls_home_url_thumbnail(): assert coll["name"] assert coll["created"] assert coll["modified"] - assert coll["resources"] assert coll["dateEarliest"] assert coll["dateLatest"] assert coll["crawlCount"] > 0 diff --git a/backend/test/test_org.py b/backend/test/test_org.py index a5e3a4cf48..57c0b8fcce 
100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -569,9 +569,6 @@ def test_org_metrics(crawler_auth_headers, default_org_id): assert data["uploadCount"] >= 0 assert data["archivedItemCount"] == data["crawlCount"] + data["uploadCount"] assert data["pageCount"] > 0 - assert data["uniquePageCount"] > 0 - assert data["crawlUniquePageCount"] > 0 - assert data["uploadUniquePageCount"] >= 0 assert data["profileCount"] >= 0 assert data["workflowsRunningCount"] >= 0 assert data["workflowsQueuedCount"] >= 0 diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index 5e454d430d..d63bb0bfb4 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -237,6 +237,7 @@ def test_crawl_info(admin_auth_headers, default_org_id): assert data["fileSize"] == wacz_size assert data["fileCount"] == 1 assert data["userName"] + assert data["version"] == 2 def test_crawls_include_seed_info(admin_auth_headers, default_org_id): diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index e565a812a0..1dea54ea23 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -207,6 +207,7 @@ def test_get_upload_replay_json( assert data["resources"][0]["hash"] assert data["errors"] == [] assert "files" not in data + assert data["version"] == 2 def test_get_upload_replay_json_admin( @@ -230,6 +231,7 @@ def test_get_upload_replay_json_admin( assert data["resources"][0]["hash"] assert data["errors"] == [] assert "files" not in data + assert data["version"] == 2 def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id): @@ -518,6 +520,7 @@ def test_list_all_crawls( assert item["started"] assert item["finished"] assert item["state"] + assert item["version"] == 2 # Test that all-crawls lastQAState and lastQAStarted sorts always puts crawls before uploads r = requests.get( @@ -923,6 +926,7 @@ def test_get_upload_replay_json_from_all_crawls( assert data["resources"][0]["hash"] assert data["errors"] == [] assert "files" not in data + assert data["version"] == 2 def test_get_upload_replay_json_admin_from_all_crawls( @@ -944,6 +948,7 @@ def test_get_upload_replay_json_admin_from_all_crawls( assert data["resources"][0]["hash"] assert data["errors"] == [] assert "files" not in data + assert data["version"] == 2 def test_update_upload_metadata_all_crawls( diff --git a/chart/Chart.yaml b/chart/Chart.yaml index c6e88b1225..f749517563 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -5,7 +5,7 @@ type: application icon: https://webrecorder.net/assets/icon.png # Browsertrix and Chart Version -version: v1.14.0-beta.1 +version: v1.14.0-beta.5 dependencies: - name: btrix-admin-logging diff --git a/chart/app-templates/background_job.yaml b/chart/app-templates/background_job.yaml index 8c02f21091..b26c723b94 100644 --- a/chart/app-templates/background_job.yaml +++ b/chart/app-templates/background_job.yaml @@ -5,11 +5,16 @@ metadata: labels: role: "background-job" job_type: {{ job_type }} +{% if oid %} btrix.org: {{ oid }} +{% endif %} spec: ttlSecondsAfterFinished: 90 backoffLimit: 3 + {% if scale %} + parallelism: {{ scale }} + {% endif %} template: spec: restartPolicy: Never @@ -35,12 +40,18 @@ spec: - name: BG_JOB_TYPE value: {{ job_type }} +{% if oid %} - name: OID value: {{ oid }} - +{% endif %} - name: CRAWL_TYPE value: {{ crawl_type }} +{% if crawl_id %} + - name: CRAWL_ID + value: {{ crawl_id }} +{% endif %} + envFrom: - configMapRef: name: backend-env-config diff --git a/chart/email-templates/failed_bg_job 
b/chart/email-templates/failed_bg_job index 27e1136314..528cfc33f3 100644 --- a/chart/email-templates/failed_bg_job +++ b/chart/email-templates/failed_bg_job @@ -2,8 +2,9 @@ Failed Background Job ~~~ Failed Background Job --------------------- - +{% if org %} Organization: {{ org.name }} ({{ job.oid }}) +{% endif %} Job type: {{ job.type }} Job ID: {{ job.id }} diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index eec66c3385..9fd4188e8b 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -39,6 +39,7 @@ data: IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}" RERUN_FROM_MIGRATION: "{{ .Values.rerun_from_migration }}" + MIGRATION_JOBS_SCALE: "{{ .Values.migration_jobs_scale | default 1 }}" PRESIGN_DURATION_MINUTES: "{{ .Values.storage_presign_duration_minutes }}" diff --git a/chart/values.yaml b/chart/values.yaml index 69a710f9c6..554bf05f6a 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -103,7 +103,7 @@ replica_deletion_delay_days: 0 # API Image # ========================================= -backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.0-beta.1" +backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.0-beta.5" backend_pull_policy: "Always" backend_password_secret: "PASSWORD!" @@ -111,6 +111,9 @@ backend_password_secret: "PASSWORD!" # number of workers per pod backend_workers: 1 +# for gunicorn --timeout +backend_worker_timeout: 60 + backend_cpu: "100m" backend_memory: "350Mi" @@ -158,7 +161,7 @@ backend_avg_memory_threshold: 95 # Nginx Image # ========================================= -frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.0-beta.1" +frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.0-beta.5" frontend_pull_policy: "Always" frontend_cpu: "10m" @@ -464,7 +467,6 @@ ingress_class: nginx # This runs as a blocking script on the frontend, so usually you'll want to have it just add a single script tag to the page with the `defer` attribute. Useful for things like analytics and bug tracking. # inject_extra: // your front-end injected script - # Signing Options # ========================================= # optionally enable signer @@ -481,6 +483,19 @@ signer_cpu: "5m" signer_memory: "50Mi" +# Migration Options (Advanced) +# ========================================= + +# enable to force rerun from specific migration +# see backend/btrixcloud/migrations/ for list of available migrations +# rerun_from_migration: + +# scale for certain migration background jobs +# migration_jobs_scale: 1 + +# Other Settings +# ========================================= + # Optional: configure load balancing annotations # service: # annotations: diff --git a/frontend/docs/docs/deploy/admin/upgrade-notes.md b/frontend/docs/docs/deploy/admin/upgrade-notes.md new file mode 100644 index 0000000000..21a422dc29 --- /dev/null +++ b/frontend/docs/docs/deploy/admin/upgrade-notes.md @@ -0,0 +1,19 @@ +# Upgrade Notes + +Some Browsertrix releases include long-running data migrations that may need to be monitored. This guide covers important information for such releases. + +## Browsertrix 1.14 + +Browsertrix 1.14, which introduces public collections, has several data migrations which affect crawl and upload objects as well as their pages. + +Migration 0042 in particular annotates all crawl pages in the database with information which is used to optimize loading times for crawl and collection replay. 
Because it must iterate through all crawl pages, this process can take a long time in deployments with many crawls and pages. + +In order to keep this optimization from blocking deployment, migration 0042 starts a parallelized background job that migrates the important data. + +If this background job fails for any reason, the superadmin will receive a background job failure notification. The background job's status can also be checked, and the job retried, at any time using the following superadmin-only background job API endpoints: + +- List all background jobs: `GET /orgs/all/jobs` +- Get background job: `GET /orgs/all/jobs/{job_id}` +- Retry background job: `POST /orgs/all/jobs/{job_id}/retry` + +For more details on these and other available API endpoints, consult the [Browsertrix API documentation](/api). diff --git a/frontend/docs/mkdocs.yml b/frontend/docs/mkdocs.yml index 008d9a49a0..5ee2a33173 100644 --- a/frontend/docs/mkdocs.yml +++ b/frontend/docs/mkdocs.yml @@ -83,6 +83,7 @@ nav: - deploy/ansible/microk8s.md - deploy/ansible/k3s.md - Administration: + - deploy/admin/upgrade-notes.md - deploy/admin/org-import-export.md - Development: - develop/index.md diff --git a/frontend/package.json b/frontend/package.json index cac6f29c3c..0dfaaa4901 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-frontend", - "version": "1.14.0-beta.1", + "version": "1.14.0-beta.5", "main": "index.ts", "license": "AGPL-3.0-or-later", "dependencies": { diff --git a/frontend/src/features/collections/collection-edit-dialog.ts b/frontend/src/features/collections/collection-edit-dialog.ts index 13350e832e..fed4f33913 100644 --- a/frontend/src/features/collections/collection-edit-dialog.ts +++ b/frontend/src/features/collections/collection-edit-dialog.ts @@ -215,101 +215,106 @@ export class CollectionEdit extends BtrixElement { if (this.dirty) e.preventDefault(); }} class="h-full [--width:var(--btrix-screen-desktop)]" - > - ${this.collection + >${this.isDialogVisible ? html` -
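The upgrade notes added in this changeset list the superadmin-only background job endpoints but do not include a request example. Purely as an illustration, the sketch below shows how a superadmin might check on and retry a failed job such as the page-optimization job started by migration 0042. It borrows the `API_PREFIX` and auth-header style of the backend tests in this diff; the base URL, token handling, and the exact JSON field names (`id`, `finished`, `success`) are assumptions for the example, not something this changeset defines.

```python
import requests

# Hypothetical placeholders, mirroring the conventions used in backend/test/:
# API_PREFIX points at the deployment's API root and the headers carry a
# superadmin bearer token. Both values are assumptions, not part of this diff.
API_PREFIX = "https://browsertrix.example.com/api"
superadmin_auth_headers = {"Authorization": "Bearer <superadmin-jwt>"}

# List all background jobs across orgs (superadmin-only endpoint).
r = requests.get(f"{API_PREFIX}/orgs/all/jobs", headers=superadmin_auth_headers)
r.raise_for_status()
jobs = r.json().get("items", [])

for job in jobs:
    # Fetch the individual job to check its current state. The field names
    # used below are assumptions about the JSON shape, made for illustration.
    detail = requests.get(
        f"{API_PREFIX}/orgs/all/jobs/{job['id']}",
        headers=superadmin_auth_headers,
    ).json()

    if detail.get("finished") and detail.get("success") is False:
        # Retry a job that finished unsuccessfully.
        retry = requests.post(
            f"{API_PREFIX}/orgs/all/jobs/{job['id']}/retry",
            headers=superadmin_auth_headers,
        )
        print(f"retried job {job['id']}: {retry.status_code}")
```

The same calls can of course be made with curl or any other HTTP client; only the three endpoint paths themselves come from the upgrade notes above.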