Commit 12f358b

Merge pull request #2271 from webrecorder/public-collections-feature
feat: Public collections, includes:
- feat: Public org profile page #2172
- feat: Collection thumbnails, start page, and public view updates #2209
- feat: Track collection events #2256
2 parents: bab5345 + 56a634e

113 files changed (+6600, -1148 lines)


backend/btrixcloud/background_jobs.py

Lines changed: 62 additions & 5 deletions
@@ -22,6 +22,7 @@
     DeleteReplicaJob,
     DeleteOrgJob,
     RecalculateOrgStatsJob,
+    ReAddOrgPagesJob,
     PaginatedBackgroundJobResponse,
     AnyJob,
     StorageRef,
@@ -301,8 +302,6 @@ async def create_delete_org_job(
         try:
             job_id = await self.crawl_manager.run_delete_org_job(
                 oid=str(org.id),
-                backend_image=os.environ.get("BACKEND_IMAGE", ""),
-                pull_policy=os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""),
                 existing_job_id=existing_job_id,
             )
             if existing_job_id:
@@ -346,8 +345,6 @@ async def create_recalculate_org_stats_job(
         try:
             job_id = await self.crawl_manager.run_recalculate_org_stats_job(
                 oid=str(org.id),
-                backend_image=os.environ.get("BACKEND_IMAGE", ""),
-                pull_policy=os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""),
                 existing_job_id=existing_job_id,
             )
             if existing_job_id:
@@ -381,6 +378,52 @@ async def create_recalculate_org_stats_job(
             print(f"warning: recalculate org stats job could not be started: {exc}")
         return None

+    async def create_re_add_org_pages_job(
+        self,
+        oid: UUID,
+        crawl_type: Optional[str] = None,
+        existing_job_id: Optional[str] = None,
+    ):
+        """Create job to (re)add all pages in an org, optionally filtered by crawl type"""
+
+        try:
+            job_id = await self.crawl_manager.run_re_add_org_pages_job(
+                oid=str(oid),
+                crawl_type=crawl_type,
+                existing_job_id=existing_job_id,
+            )
+            if existing_job_id:
+                readd_pages_job = await self.get_background_job(existing_job_id, oid)
+                previous_attempt = {
+                    "started": readd_pages_job.started,
+                    "finished": readd_pages_job.finished,
+                }
+                if readd_pages_job.previousAttempts:
+                    readd_pages_job.previousAttempts.append(previous_attempt)
+                else:
+                    readd_pages_job.previousAttempts = [previous_attempt]
+                readd_pages_job.started = dt_now()
+                readd_pages_job.finished = None
+                readd_pages_job.success = None
+            else:
+                readd_pages_job = ReAddOrgPagesJob(
+                    id=job_id,
+                    oid=oid,
+                    crawl_type=crawl_type,
+                    started=dt_now(),
+                )
+
+            await self.jobs.find_one_and_update(
+                {"_id": job_id}, {"$set": readd_pages_job.to_dict()}, upsert=True
+            )
+
+            return job_id
+        # pylint: disable=broad-exception-caught
+        except Exception as exc:
+            # pylint: disable=raise-missing-from
+            print(f"warning: re-add org pages job could not be started: {exc}")
+            return None
+
     async def job_finished(
         self,
         job_id: str,
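Usage note: a minimal sketch of kicking this job off by hand, assuming an initialized BackgroundJobOps instance; the wiring and the "upload" filter value are assumptions, not shown in this diff.

    from uuid import UUID

    async def readd_upload_pages(ops, org_id: UUID):
        # ops is an initialized BackgroundJobOps; crawl_type="upload" is an assumed filter value
        job_id = await ops.create_re_add_org_pages_job(org_id, crawl_type="upload")
        if job_id is None:
            # startup failures are swallowed above and surface only as None
            print("re-add org pages job could not be started")
        return job_id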
@@ -430,7 +473,11 @@ async def job_finished(
     async def get_background_job(
         self, job_id: str, oid: Optional[UUID] = None
     ) -> Union[
-        CreateReplicaJob, DeleteReplicaJob, DeleteOrgJob, RecalculateOrgStatsJob
+        CreateReplicaJob,
+        DeleteReplicaJob,
+        DeleteOrgJob,
+        RecalculateOrgStatsJob,
+        ReAddOrgPagesJob,
     ]:
         """Get background job"""
         query: dict[str, object] = {"_id": job_id}
@@ -454,6 +501,9 @@ def _get_job_by_type_from_data(self, data: dict[str, object]):
         if data["type"] == BgJobType.RECALCULATE_ORG_STATS:
             return RecalculateOrgStatsJob.from_dict(data)

+        if data["type"] == BgJobType.READD_ORG_PAGES:
+            return ReAddOrgPagesJob.from_dict(data)
+
         return DeleteOrgJob.from_dict(data)

     async def list_background_jobs(
@@ -595,6 +645,13 @@ async def retry_background_job(
                 existing_job_id=job_id,
             )

+        if job.type == BgJobType.READD_ORG_PAGES:
+            await self.create_re_add_org_pages_job(
+                org.id,
+                job.crawl_type,
+                existing_job_id=job_id,
+            )
+
         return {"success": True}

     async def retry_failed_background_jobs(
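Because the retry passes existing_job_id=job_id, a failed run is folded into previousAttempts on the same Mongo document rather than creating a new one. An illustrative sketch of the document shape after one retry; values are made up, field names follow the code above, and the type string is an assumed value of BgJobType.READD_ORG_PAGES:

    {
        "_id": "job-abc123",               # unchanged across retries
        "type": "readd-org-pages",         # assumed enum value
        "oid": "…",
        "crawl_type": "upload",
        "started": "2024-11-05T12:00:00Z", # reset to dt_now() on retry
        "finished": None,
        "success": None,
        "previousAttempts": [
            {"started": "2024-11-04T09:00:00Z", "finished": "2024-11-04T09:02:11Z"}
        ],
    }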

backend/btrixcloud/basecrawls.py

Lines changed: 6 additions & 18 deletions
@@ -1,6 +1,5 @@
 """ base crawl type """

-import os
 from datetime import timedelta
 from typing import Optional, List, Union, Dict, Any, Type, TYPE_CHECKING, cast, Tuple
 from uuid import UUID
@@ -29,6 +28,7 @@
     UpdatedResponse,
     DeletedResponseQuota,
     CrawlSearchValuesResponse,
+    PRESIGN_DURATION_SECONDS,
 )
 from .pagination import paginated_format, DEFAULT_PAGE_SIZE
 from .utils import dt_now, date_to_str
@@ -47,11 +47,6 @@
     CrawlConfigOps = UserManager = OrgOps = CollectionOps = PageOps = object
     StorageOps = EventWebhookOps = BackgroundJobOps = object

-# Presign duration must be less than 604800 seconds (one week),
-# so set this one minute short of a week.
-PRESIGN_MINUTES_MAX = 10079
-PRESIGN_MINUTES_DEFAULT = PRESIGN_MINUTES_MAX
-

 # ============================================================================
 # pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines
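These constants now come from models.py (see the PRESIGN_DURATION_SECONDS import added above). Its definition is not part of this diff; a plausible reconstruction, assuming the removed env-var override and one-week cap were moved there intact, would be:

    # Hypothetical models.py excerpt -- not shown in this diff
    import os

    # Presign duration must be less than 604800 seconds (one week),
    # so default to one minute short of a week
    PRESIGN_MINUTES_MAX = 10079
    PRESIGN_DURATION_MINUTES = int(
        os.environ.get("PRESIGN_DURATION_MINUTES") or PRESIGN_MINUTES_MAX
    )
    PRESIGN_DURATION_SECONDS = min(PRESIGN_DURATION_MINUTES, PRESIGN_MINUTES_MAX) * 60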
@@ -93,16 +88,8 @@ def __init__(
         self.background_job_ops = background_job_ops
         self.page_ops = cast(PageOps, None)

-        presign_duration_minutes = int(
-            os.environ.get("PRESIGN_DURATION_MINUTES") or PRESIGN_MINUTES_DEFAULT
-        )
-
-        self.presign_duration_seconds = (
-            min(presign_duration_minutes, PRESIGN_MINUTES_MAX) * 60
-        )
-
         # renew when <25% of time remaining
-        self.expire_at_duration_seconds = int(self.presign_duration_seconds * 0.75)
+        self.expire_at_duration_seconds = int(PRESIGN_DURATION_SECONDS * 0.75)

     def set_page_ops(self, page_ops):
         """set page ops reference"""
@@ -336,8 +323,9 @@ async def delete_crawls(
                     status_code=400, detail=f"Error Stopping Crawl: {exc}"
                 )

+            await self.page_ops.delete_crawl_pages(crawl_id, org.id)
+
             if type_ == "crawl":
-                await self.page_ops.delete_crawl_pages(crawl_id, org.id)
                 await self.delete_all_crawl_qa_files(crawl_id, org)

             crawl_size = await self._delete_crawl_files(crawl, org)
@@ -382,7 +370,7 @@ async def _delete_crawl_files(
         size = 0
         for file_ in crawl.files:
             size += file_.size
-            if not await self.storage_ops.delete_crawl_file_object(org, file_):
+            if not await self.storage_ops.delete_file_object(org, file_):
                 raise HTTPException(status_code=400, detail="file_deletion_error")
             # Not replicating QA run WACZs yet
             if not isinstance(crawl, QARun):
@@ -474,7 +462,7 @@ async def resolve_signed_urls(
         ):
             exp = now + delta
             presigned_url = await self.storage_ops.get_presigned_url(
-                org, file_, self.presign_duration_seconds
+                org, file_, PRESIGN_DURATION_SECONDS
             )

             prefix = "files"
