From 0babad3f9be9e7cc555208e03abed14c6d014d62 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Mon, 18 Nov 2024 15:10:21 +0100 Subject: [PATCH 001/295] move old code to .old directory --- {config => .old/config}/config.go | 0 go.mod => .old/go.mod | 0 go.sum => .old/go.sum | 0 {internal => .old/internal}/pkg/crawl/api.go | 0 .../internal}/pkg/crawl/assets.go | 0 .../internal}/pkg/crawl/capture.go | 0 .../internal}/pkg/crawl/config.go | 0 .../internal}/pkg/crawl/crawl.go | 0 .../pkg/crawl/dependencies/ytdlp/model.go | 0 .../pkg/crawl/dependencies/ytdlp/parse.go | 0 .../pkg/crawl/dependencies/ytdlp/server.go | 0 .../pkg/crawl/dependencies/ytdlp/ytdlp.go | 0 .../internal}/pkg/crawl/exclusion.go | 0 .../internal}/pkg/crawl/extractor/json.go | 0 .../pkg/crawl/extractor/json_test.go | 0 .../internal}/pkg/crawl/extractor/m3u8.go | 0 .../internal}/pkg/crawl/extractor/s3.go | 0 .../internal}/pkg/crawl/extractor/utils.go | 0 .../internal}/pkg/crawl/extractor/xml.go | 0 .../internal}/pkg/crawl/extractor/xml_test.go | 0 .../pkg/crawl/extractor/xml_test_sitemap.xml | 0 .../internal}/pkg/crawl/finish.go | 0 {internal => .old/internal}/pkg/crawl/hq.go | 0 .../internal}/pkg/crawl/http_utils.go | 0 .../internal}/pkg/crawl/link_header.go | 0 .../internal}/pkg/crawl/link_header_test.go | 0 {internal => .old/internal}/pkg/crawl/log.go | 0 .../internal}/pkg/crawl/outlinks.go | 0 .../cloudflarestream/cloudflarestream.go | 0 .../crawl/sitespecific/facebook/facebook.go | 0 .../pkg/crawl/sitespecific/ina/ina.go | 0 .../pkg/crawl/sitespecific/libsyn/libsyn.go | 0 .../pkg/crawl/sitespecific/reddit/post.go | 0 .../pkg/crawl/sitespecific/reddit/reddit.go | 0 .../crawl/sitespecific/telegram/telegram.go | 0 .../pkg/crawl/sitespecific/tiktok/tiktok.go | 0 .../sitespecific/truthsocial/truthsocial.go | 0 .../internal}/pkg/crawl/sitespecific/vk/vk.go | 0 .../pkg/crawl/sitespecific/youtube/youtube.go | 0 .../sitespecific/youtube/youtube_test.go | 0 .../sitespecific/youtube/youtube_test.html | 0 
.../internal}/pkg/crawl/stats.go | 0 .../internal}/pkg/crawl/utils.go | 0 {internal => .old/internal}/pkg/crawl/warc.go | 0 .../internal}/pkg/crawl/worker.go | 0 .../internal}/pkg/crawl/worker_pool.go | 0 .../internal}/pkg/log/elasticsearch.go | 0 {internal => .old/internal}/pkg/log/file.go | 0 {internal => .old/internal}/pkg/log/log.go | 0 {internal => .old/internal}/pkg/log/misc.go | 0 .../internal}/pkg/log/multi_handler.go | 0 {internal => .old/internal}/pkg/log/rotate.go | 0 .../internal}/pkg/log/withfields.go | 0 {internal => .old/internal}/pkg/log/writer.go | 0 .../internal}/pkg/queue/access.go | 0 .../internal}/pkg/queue/access_test.go | 0 .../internal}/pkg/queue/dequeue.go | 0 .../internal}/pkg/queue/dequeue_test.go | 0 .../internal}/pkg/queue/encoding.go | 0 .../internal}/pkg/queue/encoding_test.go | 0 .../internal}/pkg/queue/enqueue.go | 0 .../internal}/pkg/queue/enqueue_test.go | 0 .../internal}/pkg/queue/error.go | 0 .../internal}/pkg/queue/handover.go | 0 .../internal}/pkg/queue/handover_test.go | 0 .../internal}/pkg/queue/index/encoding.go | 0 .../internal}/pkg/queue/index/error.go | 0 .../internal}/pkg/queue/index/file_io.go | 0 .../internal}/pkg/queue/index/index.go | 0 .../internal}/pkg/queue/index/manager.go | 0 .../internal}/pkg/queue/index/manager_test.go | 0 .../internal}/pkg/queue/index/recovery.go | 0 .../pkg/queue/index/recovery_test.go | 0 .../internal}/pkg/queue/index/wal.go | 0 .../internal}/pkg/queue/index/wal_test.go | 0 {internal => .old/internal}/pkg/queue/item.go | 0 .../internal}/pkg/queue/item_test.go | 0 .../internal}/pkg/queue/metadata.go | 0 .../pkg/queue/protobuf/v1/item.pb.go | 0 .../pkg/queue/protobuf/v1/item.proto | 0 .../internal}/pkg/queue/queue.go | 0 .../internal}/pkg/queue/queue_test.go | 0 .../internal}/pkg/queue/stats.go | 0 .../internal}/pkg/seencheck/seencheck.go | 0 .../internal}/pkg/utils/atom_bool.go | 0 .../internal}/pkg/utils/atom_bool_test.go | 0 {internal => .old/internal}/pkg/utils/bool.go | 0 {internal => 
.old/internal}/pkg/utils/disk.go | 0 .../internal}/pkg/utils/files.go | 0 .../internal}/pkg/utils/files_test.go | 0 {internal => .old/internal}/pkg/utils/ip.go | 0 .../internal}/pkg/utils/strings.go | 0 {internal => .old/internal}/pkg/utils/url.go | 0 .../internal}/pkg/utils/url_test.go | 0 .../internal}/pkg/utils/version.go | 0 .old/main.go | 23 +++++++++++++++++++ main_test.go => .old/main_test.go | 0 main.go | 23 ------------------- 98 files changed, 23 insertions(+), 23 deletions(-) rename {config => .old/config}/config.go (100%) rename go.mod => .old/go.mod (100%) rename go.sum => .old/go.sum (100%) rename {internal => .old/internal}/pkg/crawl/api.go (100%) rename {internal => .old/internal}/pkg/crawl/assets.go (100%) rename {internal => .old/internal}/pkg/crawl/capture.go (100%) rename {internal => .old/internal}/pkg/crawl/config.go (100%) rename {internal => .old/internal}/pkg/crawl/crawl.go (100%) rename {internal => .old/internal}/pkg/crawl/dependencies/ytdlp/model.go (100%) rename {internal => .old/internal}/pkg/crawl/dependencies/ytdlp/parse.go (100%) rename {internal => .old/internal}/pkg/crawl/dependencies/ytdlp/server.go (100%) rename {internal => .old/internal}/pkg/crawl/dependencies/ytdlp/ytdlp.go (100%) rename {internal => .old/internal}/pkg/crawl/exclusion.go (100%) rename {internal => .old/internal}/pkg/crawl/extractor/json.go (100%) rename {internal => .old/internal}/pkg/crawl/extractor/json_test.go (100%) rename {internal => .old/internal}/pkg/crawl/extractor/m3u8.go (100%) rename {internal => .old/internal}/pkg/crawl/extractor/s3.go (100%) rename {internal => .old/internal}/pkg/crawl/extractor/utils.go (100%) rename {internal => .old/internal}/pkg/crawl/extractor/xml.go (100%) rename {internal => .old/internal}/pkg/crawl/extractor/xml_test.go (100%) rename {internal => .old/internal}/pkg/crawl/extractor/xml_test_sitemap.xml (100%) rename {internal => .old/internal}/pkg/crawl/finish.go (100%) rename {internal => .old/internal}/pkg/crawl/hq.go 
(100%) rename {internal => .old/internal}/pkg/crawl/http_utils.go (100%) rename {internal => .old/internal}/pkg/crawl/link_header.go (100%) rename {internal => .old/internal}/pkg/crawl/link_header_test.go (100%) rename {internal => .old/internal}/pkg/crawl/log.go (100%) rename {internal => .old/internal}/pkg/crawl/outlinks.go (100%) rename {internal => .old/internal}/pkg/crawl/sitespecific/cloudflarestream/cloudflarestream.go (100%) rename {internal => .old/internal}/pkg/crawl/sitespecific/facebook/facebook.go (100%) rename {internal => .old/internal}/pkg/crawl/sitespecific/ina/ina.go (100%) rename {internal => .old/internal}/pkg/crawl/sitespecific/libsyn/libsyn.go (100%) rename {internal => .old/internal}/pkg/crawl/sitespecific/reddit/post.go (100%) rename {internal => .old/internal}/pkg/crawl/sitespecific/reddit/reddit.go (100%) rename {internal => .old/internal}/pkg/crawl/sitespecific/telegram/telegram.go (100%) rename {internal => .old/internal}/pkg/crawl/sitespecific/tiktok/tiktok.go (100%) rename {internal => .old/internal}/pkg/crawl/sitespecific/truthsocial/truthsocial.go (100%) rename {internal => .old/internal}/pkg/crawl/sitespecific/vk/vk.go (100%) rename {internal => .old/internal}/pkg/crawl/sitespecific/youtube/youtube.go (100%) rename {internal => .old/internal}/pkg/crawl/sitespecific/youtube/youtube_test.go (100%) rename {internal => .old/internal}/pkg/crawl/sitespecific/youtube/youtube_test.html (100%) rename {internal => .old/internal}/pkg/crawl/stats.go (100%) rename {internal => .old/internal}/pkg/crawl/utils.go (100%) rename {internal => .old/internal}/pkg/crawl/warc.go (100%) rename {internal => .old/internal}/pkg/crawl/worker.go (100%) rename {internal => .old/internal}/pkg/crawl/worker_pool.go (100%) rename {internal => .old/internal}/pkg/log/elasticsearch.go (100%) rename {internal => .old/internal}/pkg/log/file.go (100%) rename {internal => .old/internal}/pkg/log/log.go (100%) rename {internal => .old/internal}/pkg/log/misc.go (100%) rename 
{internal => .old/internal}/pkg/log/multi_handler.go (100%) rename {internal => .old/internal}/pkg/log/rotate.go (100%) rename {internal => .old/internal}/pkg/log/withfields.go (100%) rename {internal => .old/internal}/pkg/log/writer.go (100%) rename {internal => .old/internal}/pkg/queue/access.go (100%) rename {internal => .old/internal}/pkg/queue/access_test.go (100%) rename {internal => .old/internal}/pkg/queue/dequeue.go (100%) rename {internal => .old/internal}/pkg/queue/dequeue_test.go (100%) rename {internal => .old/internal}/pkg/queue/encoding.go (100%) rename {internal => .old/internal}/pkg/queue/encoding_test.go (100%) rename {internal => .old/internal}/pkg/queue/enqueue.go (100%) rename {internal => .old/internal}/pkg/queue/enqueue_test.go (100%) rename {internal => .old/internal}/pkg/queue/error.go (100%) rename {internal => .old/internal}/pkg/queue/handover.go (100%) rename {internal => .old/internal}/pkg/queue/handover_test.go (100%) rename {internal => .old/internal}/pkg/queue/index/encoding.go (100%) rename {internal => .old/internal}/pkg/queue/index/error.go (100%) rename {internal => .old/internal}/pkg/queue/index/file_io.go (100%) rename {internal => .old/internal}/pkg/queue/index/index.go (100%) rename {internal => .old/internal}/pkg/queue/index/manager.go (100%) rename {internal => .old/internal}/pkg/queue/index/manager_test.go (100%) rename {internal => .old/internal}/pkg/queue/index/recovery.go (100%) rename {internal => .old/internal}/pkg/queue/index/recovery_test.go (100%) rename {internal => .old/internal}/pkg/queue/index/wal.go (100%) rename {internal => .old/internal}/pkg/queue/index/wal_test.go (100%) rename {internal => .old/internal}/pkg/queue/item.go (100%) rename {internal => .old/internal}/pkg/queue/item_test.go (100%) rename {internal => .old/internal}/pkg/queue/metadata.go (100%) rename {internal => .old/internal}/pkg/queue/protobuf/v1/item.pb.go (100%) rename {internal => .old/internal}/pkg/queue/protobuf/v1/item.proto (100%) 
rename {internal => .old/internal}/pkg/queue/queue.go (100%) rename {internal => .old/internal}/pkg/queue/queue_test.go (100%) rename {internal => .old/internal}/pkg/queue/stats.go (100%) rename {internal => .old/internal}/pkg/seencheck/seencheck.go (100%) rename {internal => .old/internal}/pkg/utils/atom_bool.go (100%) rename {internal => .old/internal}/pkg/utils/atom_bool_test.go (100%) rename {internal => .old/internal}/pkg/utils/bool.go (100%) rename {internal => .old/internal}/pkg/utils/disk.go (100%) rename {internal => .old/internal}/pkg/utils/files.go (100%) rename {internal => .old/internal}/pkg/utils/files_test.go (100%) rename {internal => .old/internal}/pkg/utils/ip.go (100%) rename {internal => .old/internal}/pkg/utils/strings.go (100%) rename {internal => .old/internal}/pkg/utils/url.go (100%) rename {internal => .old/internal}/pkg/utils/url_test.go (100%) rename {internal => .old/internal}/pkg/utils/version.go (100%) create mode 100644 .old/main.go rename main_test.go => .old/main_test.go (100%) diff --git a/config/config.go b/.old/config/config.go similarity index 100% rename from config/config.go rename to .old/config/config.go diff --git a/go.mod b/.old/go.mod similarity index 100% rename from go.mod rename to .old/go.mod diff --git a/go.sum b/.old/go.sum similarity index 100% rename from go.sum rename to .old/go.sum diff --git a/internal/pkg/crawl/api.go b/.old/internal/pkg/crawl/api.go similarity index 100% rename from internal/pkg/crawl/api.go rename to .old/internal/pkg/crawl/api.go diff --git a/internal/pkg/crawl/assets.go b/.old/internal/pkg/crawl/assets.go similarity index 100% rename from internal/pkg/crawl/assets.go rename to .old/internal/pkg/crawl/assets.go diff --git a/internal/pkg/crawl/capture.go b/.old/internal/pkg/crawl/capture.go similarity index 100% rename from internal/pkg/crawl/capture.go rename to .old/internal/pkg/crawl/capture.go diff --git a/internal/pkg/crawl/config.go b/.old/internal/pkg/crawl/config.go similarity index 
100% rename from internal/pkg/crawl/config.go rename to .old/internal/pkg/crawl/config.go diff --git a/internal/pkg/crawl/crawl.go b/.old/internal/pkg/crawl/crawl.go similarity index 100% rename from internal/pkg/crawl/crawl.go rename to .old/internal/pkg/crawl/crawl.go diff --git a/internal/pkg/crawl/dependencies/ytdlp/model.go b/.old/internal/pkg/crawl/dependencies/ytdlp/model.go similarity index 100% rename from internal/pkg/crawl/dependencies/ytdlp/model.go rename to .old/internal/pkg/crawl/dependencies/ytdlp/model.go diff --git a/internal/pkg/crawl/dependencies/ytdlp/parse.go b/.old/internal/pkg/crawl/dependencies/ytdlp/parse.go similarity index 100% rename from internal/pkg/crawl/dependencies/ytdlp/parse.go rename to .old/internal/pkg/crawl/dependencies/ytdlp/parse.go diff --git a/internal/pkg/crawl/dependencies/ytdlp/server.go b/.old/internal/pkg/crawl/dependencies/ytdlp/server.go similarity index 100% rename from internal/pkg/crawl/dependencies/ytdlp/server.go rename to .old/internal/pkg/crawl/dependencies/ytdlp/server.go diff --git a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go b/.old/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go similarity index 100% rename from internal/pkg/crawl/dependencies/ytdlp/ytdlp.go rename to .old/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go diff --git a/internal/pkg/crawl/exclusion.go b/.old/internal/pkg/crawl/exclusion.go similarity index 100% rename from internal/pkg/crawl/exclusion.go rename to .old/internal/pkg/crawl/exclusion.go diff --git a/internal/pkg/crawl/extractor/json.go b/.old/internal/pkg/crawl/extractor/json.go similarity index 100% rename from internal/pkg/crawl/extractor/json.go rename to .old/internal/pkg/crawl/extractor/json.go diff --git a/internal/pkg/crawl/extractor/json_test.go b/.old/internal/pkg/crawl/extractor/json_test.go similarity index 100% rename from internal/pkg/crawl/extractor/json_test.go rename to .old/internal/pkg/crawl/extractor/json_test.go diff --git 
a/internal/pkg/crawl/extractor/m3u8.go b/.old/internal/pkg/crawl/extractor/m3u8.go similarity index 100% rename from internal/pkg/crawl/extractor/m3u8.go rename to .old/internal/pkg/crawl/extractor/m3u8.go diff --git a/internal/pkg/crawl/extractor/s3.go b/.old/internal/pkg/crawl/extractor/s3.go similarity index 100% rename from internal/pkg/crawl/extractor/s3.go rename to .old/internal/pkg/crawl/extractor/s3.go diff --git a/internal/pkg/crawl/extractor/utils.go b/.old/internal/pkg/crawl/extractor/utils.go similarity index 100% rename from internal/pkg/crawl/extractor/utils.go rename to .old/internal/pkg/crawl/extractor/utils.go diff --git a/internal/pkg/crawl/extractor/xml.go b/.old/internal/pkg/crawl/extractor/xml.go similarity index 100% rename from internal/pkg/crawl/extractor/xml.go rename to .old/internal/pkg/crawl/extractor/xml.go diff --git a/internal/pkg/crawl/extractor/xml_test.go b/.old/internal/pkg/crawl/extractor/xml_test.go similarity index 100% rename from internal/pkg/crawl/extractor/xml_test.go rename to .old/internal/pkg/crawl/extractor/xml_test.go diff --git a/internal/pkg/crawl/extractor/xml_test_sitemap.xml b/.old/internal/pkg/crawl/extractor/xml_test_sitemap.xml similarity index 100% rename from internal/pkg/crawl/extractor/xml_test_sitemap.xml rename to .old/internal/pkg/crawl/extractor/xml_test_sitemap.xml diff --git a/internal/pkg/crawl/finish.go b/.old/internal/pkg/crawl/finish.go similarity index 100% rename from internal/pkg/crawl/finish.go rename to .old/internal/pkg/crawl/finish.go diff --git a/internal/pkg/crawl/hq.go b/.old/internal/pkg/crawl/hq.go similarity index 100% rename from internal/pkg/crawl/hq.go rename to .old/internal/pkg/crawl/hq.go diff --git a/internal/pkg/crawl/http_utils.go b/.old/internal/pkg/crawl/http_utils.go similarity index 100% rename from internal/pkg/crawl/http_utils.go rename to .old/internal/pkg/crawl/http_utils.go diff --git a/internal/pkg/crawl/link_header.go b/.old/internal/pkg/crawl/link_header.go 
similarity index 100% rename from internal/pkg/crawl/link_header.go rename to .old/internal/pkg/crawl/link_header.go diff --git a/internal/pkg/crawl/link_header_test.go b/.old/internal/pkg/crawl/link_header_test.go similarity index 100% rename from internal/pkg/crawl/link_header_test.go rename to .old/internal/pkg/crawl/link_header_test.go diff --git a/internal/pkg/crawl/log.go b/.old/internal/pkg/crawl/log.go similarity index 100% rename from internal/pkg/crawl/log.go rename to .old/internal/pkg/crawl/log.go diff --git a/internal/pkg/crawl/outlinks.go b/.old/internal/pkg/crawl/outlinks.go similarity index 100% rename from internal/pkg/crawl/outlinks.go rename to .old/internal/pkg/crawl/outlinks.go diff --git a/internal/pkg/crawl/sitespecific/cloudflarestream/cloudflarestream.go b/.old/internal/pkg/crawl/sitespecific/cloudflarestream/cloudflarestream.go similarity index 100% rename from internal/pkg/crawl/sitespecific/cloudflarestream/cloudflarestream.go rename to .old/internal/pkg/crawl/sitespecific/cloudflarestream/cloudflarestream.go diff --git a/internal/pkg/crawl/sitespecific/facebook/facebook.go b/.old/internal/pkg/crawl/sitespecific/facebook/facebook.go similarity index 100% rename from internal/pkg/crawl/sitespecific/facebook/facebook.go rename to .old/internal/pkg/crawl/sitespecific/facebook/facebook.go diff --git a/internal/pkg/crawl/sitespecific/ina/ina.go b/.old/internal/pkg/crawl/sitespecific/ina/ina.go similarity index 100% rename from internal/pkg/crawl/sitespecific/ina/ina.go rename to .old/internal/pkg/crawl/sitespecific/ina/ina.go diff --git a/internal/pkg/crawl/sitespecific/libsyn/libsyn.go b/.old/internal/pkg/crawl/sitespecific/libsyn/libsyn.go similarity index 100% rename from internal/pkg/crawl/sitespecific/libsyn/libsyn.go rename to .old/internal/pkg/crawl/sitespecific/libsyn/libsyn.go diff --git a/internal/pkg/crawl/sitespecific/reddit/post.go b/.old/internal/pkg/crawl/sitespecific/reddit/post.go similarity index 100% rename from 
internal/pkg/crawl/sitespecific/reddit/post.go rename to .old/internal/pkg/crawl/sitespecific/reddit/post.go diff --git a/internal/pkg/crawl/sitespecific/reddit/reddit.go b/.old/internal/pkg/crawl/sitespecific/reddit/reddit.go similarity index 100% rename from internal/pkg/crawl/sitespecific/reddit/reddit.go rename to .old/internal/pkg/crawl/sitespecific/reddit/reddit.go diff --git a/internal/pkg/crawl/sitespecific/telegram/telegram.go b/.old/internal/pkg/crawl/sitespecific/telegram/telegram.go similarity index 100% rename from internal/pkg/crawl/sitespecific/telegram/telegram.go rename to .old/internal/pkg/crawl/sitespecific/telegram/telegram.go diff --git a/internal/pkg/crawl/sitespecific/tiktok/tiktok.go b/.old/internal/pkg/crawl/sitespecific/tiktok/tiktok.go similarity index 100% rename from internal/pkg/crawl/sitespecific/tiktok/tiktok.go rename to .old/internal/pkg/crawl/sitespecific/tiktok/tiktok.go diff --git a/internal/pkg/crawl/sitespecific/truthsocial/truthsocial.go b/.old/internal/pkg/crawl/sitespecific/truthsocial/truthsocial.go similarity index 100% rename from internal/pkg/crawl/sitespecific/truthsocial/truthsocial.go rename to .old/internal/pkg/crawl/sitespecific/truthsocial/truthsocial.go diff --git a/internal/pkg/crawl/sitespecific/vk/vk.go b/.old/internal/pkg/crawl/sitespecific/vk/vk.go similarity index 100% rename from internal/pkg/crawl/sitespecific/vk/vk.go rename to .old/internal/pkg/crawl/sitespecific/vk/vk.go diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube.go b/.old/internal/pkg/crawl/sitespecific/youtube/youtube.go similarity index 100% rename from internal/pkg/crawl/sitespecific/youtube/youtube.go rename to .old/internal/pkg/crawl/sitespecific/youtube/youtube.go diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube_test.go b/.old/internal/pkg/crawl/sitespecific/youtube/youtube_test.go similarity index 100% rename from internal/pkg/crawl/sitespecific/youtube/youtube_test.go rename to 
.old/internal/pkg/crawl/sitespecific/youtube/youtube_test.go diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube_test.html b/.old/internal/pkg/crawl/sitespecific/youtube/youtube_test.html similarity index 100% rename from internal/pkg/crawl/sitespecific/youtube/youtube_test.html rename to .old/internal/pkg/crawl/sitespecific/youtube/youtube_test.html diff --git a/internal/pkg/crawl/stats.go b/.old/internal/pkg/crawl/stats.go similarity index 100% rename from internal/pkg/crawl/stats.go rename to .old/internal/pkg/crawl/stats.go diff --git a/internal/pkg/crawl/utils.go b/.old/internal/pkg/crawl/utils.go similarity index 100% rename from internal/pkg/crawl/utils.go rename to .old/internal/pkg/crawl/utils.go diff --git a/internal/pkg/crawl/warc.go b/.old/internal/pkg/crawl/warc.go similarity index 100% rename from internal/pkg/crawl/warc.go rename to .old/internal/pkg/crawl/warc.go diff --git a/internal/pkg/crawl/worker.go b/.old/internal/pkg/crawl/worker.go similarity index 100% rename from internal/pkg/crawl/worker.go rename to .old/internal/pkg/crawl/worker.go diff --git a/internal/pkg/crawl/worker_pool.go b/.old/internal/pkg/crawl/worker_pool.go similarity index 100% rename from internal/pkg/crawl/worker_pool.go rename to .old/internal/pkg/crawl/worker_pool.go diff --git a/internal/pkg/log/elasticsearch.go b/.old/internal/pkg/log/elasticsearch.go similarity index 100% rename from internal/pkg/log/elasticsearch.go rename to .old/internal/pkg/log/elasticsearch.go diff --git a/internal/pkg/log/file.go b/.old/internal/pkg/log/file.go similarity index 100% rename from internal/pkg/log/file.go rename to .old/internal/pkg/log/file.go diff --git a/internal/pkg/log/log.go b/.old/internal/pkg/log/log.go similarity index 100% rename from internal/pkg/log/log.go rename to .old/internal/pkg/log/log.go diff --git a/internal/pkg/log/misc.go b/.old/internal/pkg/log/misc.go similarity index 100% rename from internal/pkg/log/misc.go rename to .old/internal/pkg/log/misc.go 
diff --git a/internal/pkg/log/multi_handler.go b/.old/internal/pkg/log/multi_handler.go similarity index 100% rename from internal/pkg/log/multi_handler.go rename to .old/internal/pkg/log/multi_handler.go diff --git a/internal/pkg/log/rotate.go b/.old/internal/pkg/log/rotate.go similarity index 100% rename from internal/pkg/log/rotate.go rename to .old/internal/pkg/log/rotate.go diff --git a/internal/pkg/log/withfields.go b/.old/internal/pkg/log/withfields.go similarity index 100% rename from internal/pkg/log/withfields.go rename to .old/internal/pkg/log/withfields.go diff --git a/internal/pkg/log/writer.go b/.old/internal/pkg/log/writer.go similarity index 100% rename from internal/pkg/log/writer.go rename to .old/internal/pkg/log/writer.go diff --git a/internal/pkg/queue/access.go b/.old/internal/pkg/queue/access.go similarity index 100% rename from internal/pkg/queue/access.go rename to .old/internal/pkg/queue/access.go diff --git a/internal/pkg/queue/access_test.go b/.old/internal/pkg/queue/access_test.go similarity index 100% rename from internal/pkg/queue/access_test.go rename to .old/internal/pkg/queue/access_test.go diff --git a/internal/pkg/queue/dequeue.go b/.old/internal/pkg/queue/dequeue.go similarity index 100% rename from internal/pkg/queue/dequeue.go rename to .old/internal/pkg/queue/dequeue.go diff --git a/internal/pkg/queue/dequeue_test.go b/.old/internal/pkg/queue/dequeue_test.go similarity index 100% rename from internal/pkg/queue/dequeue_test.go rename to .old/internal/pkg/queue/dequeue_test.go diff --git a/internal/pkg/queue/encoding.go b/.old/internal/pkg/queue/encoding.go similarity index 100% rename from internal/pkg/queue/encoding.go rename to .old/internal/pkg/queue/encoding.go diff --git a/internal/pkg/queue/encoding_test.go b/.old/internal/pkg/queue/encoding_test.go similarity index 100% rename from internal/pkg/queue/encoding_test.go rename to .old/internal/pkg/queue/encoding_test.go diff --git a/internal/pkg/queue/enqueue.go 
b/.old/internal/pkg/queue/enqueue.go similarity index 100% rename from internal/pkg/queue/enqueue.go rename to .old/internal/pkg/queue/enqueue.go diff --git a/internal/pkg/queue/enqueue_test.go b/.old/internal/pkg/queue/enqueue_test.go similarity index 100% rename from internal/pkg/queue/enqueue_test.go rename to .old/internal/pkg/queue/enqueue_test.go diff --git a/internal/pkg/queue/error.go b/.old/internal/pkg/queue/error.go similarity index 100% rename from internal/pkg/queue/error.go rename to .old/internal/pkg/queue/error.go diff --git a/internal/pkg/queue/handover.go b/.old/internal/pkg/queue/handover.go similarity index 100% rename from internal/pkg/queue/handover.go rename to .old/internal/pkg/queue/handover.go diff --git a/internal/pkg/queue/handover_test.go b/.old/internal/pkg/queue/handover_test.go similarity index 100% rename from internal/pkg/queue/handover_test.go rename to .old/internal/pkg/queue/handover_test.go diff --git a/internal/pkg/queue/index/encoding.go b/.old/internal/pkg/queue/index/encoding.go similarity index 100% rename from internal/pkg/queue/index/encoding.go rename to .old/internal/pkg/queue/index/encoding.go diff --git a/internal/pkg/queue/index/error.go b/.old/internal/pkg/queue/index/error.go similarity index 100% rename from internal/pkg/queue/index/error.go rename to .old/internal/pkg/queue/index/error.go diff --git a/internal/pkg/queue/index/file_io.go b/.old/internal/pkg/queue/index/file_io.go similarity index 100% rename from internal/pkg/queue/index/file_io.go rename to .old/internal/pkg/queue/index/file_io.go diff --git a/internal/pkg/queue/index/index.go b/.old/internal/pkg/queue/index/index.go similarity index 100% rename from internal/pkg/queue/index/index.go rename to .old/internal/pkg/queue/index/index.go diff --git a/internal/pkg/queue/index/manager.go b/.old/internal/pkg/queue/index/manager.go similarity index 100% rename from internal/pkg/queue/index/manager.go rename to .old/internal/pkg/queue/index/manager.go diff 
--git a/internal/pkg/queue/index/manager_test.go b/.old/internal/pkg/queue/index/manager_test.go similarity index 100% rename from internal/pkg/queue/index/manager_test.go rename to .old/internal/pkg/queue/index/manager_test.go diff --git a/internal/pkg/queue/index/recovery.go b/.old/internal/pkg/queue/index/recovery.go similarity index 100% rename from internal/pkg/queue/index/recovery.go rename to .old/internal/pkg/queue/index/recovery.go diff --git a/internal/pkg/queue/index/recovery_test.go b/.old/internal/pkg/queue/index/recovery_test.go similarity index 100% rename from internal/pkg/queue/index/recovery_test.go rename to .old/internal/pkg/queue/index/recovery_test.go diff --git a/internal/pkg/queue/index/wal.go b/.old/internal/pkg/queue/index/wal.go similarity index 100% rename from internal/pkg/queue/index/wal.go rename to .old/internal/pkg/queue/index/wal.go diff --git a/internal/pkg/queue/index/wal_test.go b/.old/internal/pkg/queue/index/wal_test.go similarity index 100% rename from internal/pkg/queue/index/wal_test.go rename to .old/internal/pkg/queue/index/wal_test.go diff --git a/internal/pkg/queue/item.go b/.old/internal/pkg/queue/item.go similarity index 100% rename from internal/pkg/queue/item.go rename to .old/internal/pkg/queue/item.go diff --git a/internal/pkg/queue/item_test.go b/.old/internal/pkg/queue/item_test.go similarity index 100% rename from internal/pkg/queue/item_test.go rename to .old/internal/pkg/queue/item_test.go diff --git a/internal/pkg/queue/metadata.go b/.old/internal/pkg/queue/metadata.go similarity index 100% rename from internal/pkg/queue/metadata.go rename to .old/internal/pkg/queue/metadata.go diff --git a/internal/pkg/queue/protobuf/v1/item.pb.go b/.old/internal/pkg/queue/protobuf/v1/item.pb.go similarity index 100% rename from internal/pkg/queue/protobuf/v1/item.pb.go rename to .old/internal/pkg/queue/protobuf/v1/item.pb.go diff --git a/internal/pkg/queue/protobuf/v1/item.proto 
b/.old/internal/pkg/queue/protobuf/v1/item.proto similarity index 100% rename from internal/pkg/queue/protobuf/v1/item.proto rename to .old/internal/pkg/queue/protobuf/v1/item.proto diff --git a/internal/pkg/queue/queue.go b/.old/internal/pkg/queue/queue.go similarity index 100% rename from internal/pkg/queue/queue.go rename to .old/internal/pkg/queue/queue.go diff --git a/internal/pkg/queue/queue_test.go b/.old/internal/pkg/queue/queue_test.go similarity index 100% rename from internal/pkg/queue/queue_test.go rename to .old/internal/pkg/queue/queue_test.go diff --git a/internal/pkg/queue/stats.go b/.old/internal/pkg/queue/stats.go similarity index 100% rename from internal/pkg/queue/stats.go rename to .old/internal/pkg/queue/stats.go diff --git a/internal/pkg/seencheck/seencheck.go b/.old/internal/pkg/seencheck/seencheck.go similarity index 100% rename from internal/pkg/seencheck/seencheck.go rename to .old/internal/pkg/seencheck/seencheck.go diff --git a/internal/pkg/utils/atom_bool.go b/.old/internal/pkg/utils/atom_bool.go similarity index 100% rename from internal/pkg/utils/atom_bool.go rename to .old/internal/pkg/utils/atom_bool.go diff --git a/internal/pkg/utils/atom_bool_test.go b/.old/internal/pkg/utils/atom_bool_test.go similarity index 100% rename from internal/pkg/utils/atom_bool_test.go rename to .old/internal/pkg/utils/atom_bool_test.go diff --git a/internal/pkg/utils/bool.go b/.old/internal/pkg/utils/bool.go similarity index 100% rename from internal/pkg/utils/bool.go rename to .old/internal/pkg/utils/bool.go diff --git a/internal/pkg/utils/disk.go b/.old/internal/pkg/utils/disk.go similarity index 100% rename from internal/pkg/utils/disk.go rename to .old/internal/pkg/utils/disk.go diff --git a/internal/pkg/utils/files.go b/.old/internal/pkg/utils/files.go similarity index 100% rename from internal/pkg/utils/files.go rename to .old/internal/pkg/utils/files.go diff --git a/internal/pkg/utils/files_test.go b/.old/internal/pkg/utils/files_test.go 
similarity index 100% rename from internal/pkg/utils/files_test.go rename to .old/internal/pkg/utils/files_test.go diff --git a/internal/pkg/utils/ip.go b/.old/internal/pkg/utils/ip.go similarity index 100% rename from internal/pkg/utils/ip.go rename to .old/internal/pkg/utils/ip.go diff --git a/internal/pkg/utils/strings.go b/.old/internal/pkg/utils/strings.go similarity index 100% rename from internal/pkg/utils/strings.go rename to .old/internal/pkg/utils/strings.go diff --git a/internal/pkg/utils/url.go b/.old/internal/pkg/utils/url.go similarity index 100% rename from internal/pkg/utils/url.go rename to .old/internal/pkg/utils/url.go diff --git a/internal/pkg/utils/url_test.go b/.old/internal/pkg/utils/url_test.go similarity index 100% rename from internal/pkg/utils/url_test.go rename to .old/internal/pkg/utils/url_test.go diff --git a/internal/pkg/utils/version.go b/.old/internal/pkg/utils/version.go similarity index 100% rename from internal/pkg/utils/version.go rename to .old/internal/pkg/utils/version.go diff --git a/.old/main.go b/.old/main.go new file mode 100644 index 00000000..b616c731 --- /dev/null +++ b/.old/main.go @@ -0,0 +1,23 @@ +// Zeno is a web crawler designed to operate wide crawls or to simply archive one web page. +// Zeno's key concepts are: portability, performance, simplicity ; with an emphasis on performance. + +// Authors: +// +// Corentin Barreau +// Jake LaFountain +// Thomas Foubert +package main + +import ( + "fmt" + "os" + + "github.com/internetarchive/Zeno/cmd" +) + +func main() { + if err := cmd.Run(); err != nil { + fmt.Println(err) + os.Exit(1) + } +} diff --git a/main_test.go b/.old/main_test.go similarity index 100% rename from main_test.go rename to .old/main_test.go diff --git a/main.go b/main.go index b616c731..e69de29b 100644 --- a/main.go +++ b/main.go @@ -1,23 +0,0 @@ -// Zeno is a web crawler designed to operate wide crawls or to simply archive one web page. 
-// Zeno's key concepts are: portability, performance, simplicity ; with an emphasis on performance. - -// Authors: -// -// Corentin Barreau -// Jake LaFountain -// Thomas Foubert -package main - -import ( - "fmt" - "os" - - "github.com/internetarchive/Zeno/cmd" -) - -func main() { - if err := cmd.Run(); err != nil { - fmt.Println(err) - os.Exit(1) - } -} From 0de7f65035b18afe0113491eaeea406256426ccf Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Mon, 18 Nov 2024 17:41:25 +0100 Subject: [PATCH 002/295] first draft of reactor --- {cmd => .old/cmd}/cmd.go | 0 {cmd => .old/cmd}/get.go | 0 {cmd => .old/cmd}/get_hq.go | 0 {cmd => .old/cmd}/get_list.go | 0 {cmd => .old/cmd}/get_url.go | 0 go.mod | 15 ++++ go.sum | 13 ++++ internal/pkg/reactor/error.go | 17 +++++ internal/pkg/reactor/reactor.go | 131 ++++++++++++++++++++++++++++++++ main.go | 82 ++++++++++++++++++++ pkg/models/seed.go | 46 +++++++++++ 11 files changed, 304 insertions(+) rename {cmd => .old/cmd}/cmd.go (100%) rename {cmd => .old/cmd}/get.go (100%) rename {cmd => .old/cmd}/get_hq.go (100%) rename {cmd => .old/cmd}/get_list.go (100%) rename {cmd => .old/cmd}/get_url.go (100%) create mode 100644 go.mod create mode 100644 go.sum create mode 100644 internal/pkg/reactor/error.go create mode 100644 internal/pkg/reactor/reactor.go create mode 100644 pkg/models/seed.go diff --git a/cmd/cmd.go b/.old/cmd/cmd.go similarity index 100% rename from cmd/cmd.go rename to .old/cmd/cmd.go diff --git a/cmd/get.go b/.old/cmd/get.go similarity index 100% rename from cmd/get.go rename to .old/cmd/get.go diff --git a/cmd/get_hq.go b/.old/cmd/get_hq.go similarity index 100% rename from cmd/get_hq.go rename to .old/cmd/get_hq.go diff --git a/cmd/get_list.go b/.old/cmd/get_list.go similarity index 100% rename from cmd/get_list.go rename to .old/cmd/get_list.go diff --git a/cmd/get_url.go b/.old/cmd/get_url.go similarity index 100% rename from cmd/get_url.go rename to .old/cmd/get_url.go diff --git a/go.mod b/go.mod new file 
mode 100644 index 00000000..4516a8ac --- /dev/null +++ b/go.mod @@ -0,0 +1,15 @@ +module github.com/internetarchive/Zeno + +go 1.23.3 + +require ( + github.com/google/uuid v1.6.0 + github.com/internetarchive/gocrawlhq v1.2.20 +) + +require ( + github.com/gobwas/httphead v0.1.0 // indirect + github.com/gobwas/pool v0.2.1 // indirect + github.com/gobwas/ws v1.4.0 // indirect + golang.org/x/sys v0.26.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 00000000..71839d02 --- /dev/null +++ b/go.sum @@ -0,0 +1,13 @@ +github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU= +github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= +github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= +github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= +github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs= +github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/internetarchive/gocrawlhq v1.2.20 h1:0mIIt9lhPacKr6L2JeISoopQ8EgzC3dISJ3ITGGbOp4= +github.com/internetarchive/gocrawlhq v1.2.20/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= diff --git a/internal/pkg/reactor/error.go b/internal/pkg/reactor/error.go new file mode 100644 index 00000000..e51abae1 --- /dev/null +++ b/internal/pkg/reactor/error.go @@ -0,0 +1,17 @@ +package reactor + +import "errors" + +var ( + // ErrReactorAlreadyInitialized is the error returned when the reactor is already initialized + ErrReactorAlreadyInitialized = 
errors.New("reactor already initialized") + // ErrReactorNotInitialized is the error returned when the reactor is not initialized + ErrReactorNotInitialized = errors.New("reactor not initialized") + // ErrReactorShuttingDown is the error returned when the reactor is shutting down + ErrReactorShuttingDown = errors.New("reactor shutting down") + + // ErrFeedbackItemNotPresent is the error returned when an item was sent to the feedback channel but not found in the state table + ErrFeedbackItemNotPresent = errors.New("feedback item not present in state table") + // ErrFinisehdItemNotFound is the error returned when an item been marked as finished but not found in the state table + ErrFinisehdItemNotFound = errors.New("markAsFinished item not present in state table") +) diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go new file mode 100644 index 00000000..4ebeb0a9 --- /dev/null +++ b/internal/pkg/reactor/reactor.go @@ -0,0 +1,131 @@ +// Package reactor provides functionality to manage and control the processing of seeds. +package reactor + +import ( + "context" + "fmt" + "sync" + + "github.com/internetarchive/Zeno/pkg/models" +) + +// reactor struct holds the state and channels for managing seeds processing. +type reactor struct { + tokenPool chan struct{} // Token pool to control asset count + ctx context.Context // Context for stopping the reactor + cancelFunc context.CancelFunc + input chan *models.Seed // Combined input channel for source and feedback + output chan *models.Seed // Output channel + stateTable sync.Map // State table for tracking seeds by UUID + wg sync.WaitGroup // WaitGroup to manage goroutines +} + +var ( + globalReactor *reactor + once sync.Once +) + +// Start initializes the global reactor with the given maximum tokens. +// This method can only be called once. 
+func Start(maxTokens int, outputChan chan *models.Seed) error { + var done bool + + once.Do(func() { + ctx, cancel := context.WithCancel(context.Background()) + globalReactor = &reactor{ + tokenPool: make(chan struct{}, maxTokens), + ctx: ctx, + cancelFunc: cancel, + input: make(chan *models.Seed), + output: outputChan, + } + globalReactor.wg.Add(1) + go globalReactor.run() + fmt.Println("Reactor started") + done = true + }) + + if !done { + return ErrReactorAlreadyInitialized + } + + return nil +} + +// Stop stops the global reactor and waits for all goroutines to finish. +func Stop() { + if globalReactor != nil { + globalReactor.cancelFunc() + globalReactor.wg.Wait() + close(globalReactor.output) + fmt.Println("Reactor stopped") + } +} + +// ReceiveFeedback sends an item to the feedback channel. +func ReceiveFeedback(item *models.Seed) error { + if globalReactor == nil { + return ErrReactorNotInitialized + } + + select { + case globalReactor.input <- item: + item.Source = models.SeedSourceFeedback + _, loaded := globalReactor.stateTable.Swap(item.UUID, item) + if !loaded { + // An item sent to the feedback channel should be present on the state table, if not present reactor should error out + return ErrFeedbackItemNotPresent + } + return nil + case <-globalReactor.ctx.Done(): + return ErrReactorShuttingDown + } +} + +// ReceiveSource sends an item to the source seeds channel. +func ReceiveSource(item *models.Seed) error { + if globalReactor == nil { + return ErrReactorNotInitialized + } + + select { + case globalReactor.tokenPool <- struct{}{}: + globalReactor.input <- item + globalReactor.stateTable.Store(item.UUID, item) + return nil + case <-globalReactor.ctx.Done(): + return ErrReactorShuttingDown + } +} + +// MarkAsFinished marks an item as finished and releases a token if found in the state table. 
+func MarkAsFinished(item *models.Seed) error { + if globalReactor == nil { + return ErrReactorNotInitialized + } + + if _, loaded := globalReactor.stateTable.LoadAndDelete(item.UUID); loaded { + <-globalReactor.tokenPool + return nil + } + return ErrFinisehdItemNotFound +} + +func (r *reactor) run() { + defer r.wg.Done() + + for { + select { + // Closes the run routine when context is canceled + case <-r.ctx.Done(): + fmt.Println("Reactor shutting down...") + return + + // Feeds items to the output channel + case item, ok := <-r.input: + if ok { + r.output <- item + } + } + } +} diff --git a/main.go b/main.go index e69de29b..c41448e4 100644 --- a/main.go +++ b/main.go @@ -0,0 +1,82 @@ +package main + +import ( + "fmt" + "time" + + "github.com/google/uuid" + "github.com/internetarchive/Zeno/internal/pkg/reactor" + "github.com/internetarchive/Zeno/pkg/models" + "github.com/internetarchive/gocrawlhq" +) + +func main() { + // Initialize the reactor with a maximum of 5 tokens + outputChan := make(chan *models.Seed) + err := reactor.Start(5, outputChan) + if err != nil { + fmt.Println("Error starting reactor:", err) + return + } + defer reactor.Stop() + + // Consume items from the output channel, start 5 goroutines + for i := 0; i < 5; i++ { + go func() { + for { + select { + case item := <-outputChan: + fmt.Println("Consumed item from output channel:", item.URL.Value, item.Source) + + // Send feedback for the consumed item + if item.Source != models.SeedSourceFeedback { + err := reactor.ReceiveFeedback(item) + if err != nil { + fmt.Println("Error sending feedback:", err) + } + continue + } + + // Mark the item as finished + if item.Source == models.SeedSourceFeedback { + err := reactor.MarkAsFinished(item) + if err != nil { + fmt.Println("Error marking item as finished:", err) + } + fmt.Println("Marked item as finished:", item.URL.Value) + continue + } + } + } + }() + } + + // Create mock seeds + uuid1 := uuid.New() + uuid2 := uuid.New() + mockSeeds := []*models.Seed{ 
+ { + UUID: &uuid1, + URL: &gocrawlhq.URL{Value: "http://example.com/1"}, + Status: models.SeedFresh, + Source: models.SeedSourceQueue, + }, + { + UUID: &uuid2, + URL: &gocrawlhq.URL{Value: "http://example.com/2"}, + Status: models.SeedFresh, + Source: models.SeedSourceQueue, + }, + } + + // Queue mock seeds to the source channel + for _, seed := range mockSeeds { + err := reactor.ReceiveSource(seed) + if err != nil { + fmt.Println("Error queuing seed to source channel:", err) + } + } + + // Allow some time for processing + time.Sleep(10 * time.Second) +} diff --git a/pkg/models/seed.go b/pkg/models/seed.go new file mode 100644 index 00000000..7c412d0b --- /dev/null +++ b/pkg/models/seed.go @@ -0,0 +1,46 @@ +package models + +import ( + "github.com/google/uuid" + "github.com/internetarchive/gocrawlhq" +) + +// Seed represents a URL, it's assets and it's state in the pipeline +type Seed struct { + UUID *uuid.UUID // UUID is the unique identifier of the seed + URL *gocrawlhq.URL // URL is the URL of the seed + Status SeedState // Status is the state of the seed in the pipeline + Source SeedSource // Source is the source of the seed in the pipeline + AssetsCaptured bool // AssetsCaptured is the flag to indicate if the assets of the seed has been captured + Assets []*gocrawlhq.URL // Assets is the list of assets of the seed +} + +// SeedState qualifies the state of a seed in the pipeline +type SeedState int + +const ( + // SeedFresh is the initial state of a seed either it's from HQ, the Queue or Feedback + SeedFresh SeedState = iota + // SeedPreProcessed is the state after the seed has been pre-processed + SeedPreProcessed + // SeedCaptured is the state after the seed has been captured + SeedCaptured + // SeedPostProcessed is the state after the seed has been post-processed + SeedPostProcessed + // SeedFailed is the state after the seed has failed + SeedFailed + // SeedCompleted is the state after the seed has been completed + SeedCompleted +) + +// SeedSource 
qualifies the source of a seed in the pipeline +type SeedSource int + +const ( + // SeedSourceQueue is for seeds that are from the Queue + SeedSourceQueue SeedSource = iota + // SeedSourceHQ is for seeds that are from the HQ + SeedSourceHQ + // SeedSourceFeedback is for seeds that are from the Feedback + SeedSourceFeedback +) From a45c07d7303327aceb5a2dafc935410ce9958d3b Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Mon, 18 Nov 2024 18:29:08 +0100 Subject: [PATCH 003/295] working reactor --- internal/pkg/reactor/reactor.go | 16 ++++++++-------- internal/pkg/reactor/state.go | 11 +++++++++++ main.go | 31 +++++++++++++------------------ 3 files changed, 32 insertions(+), 26 deletions(-) create mode 100644 internal/pkg/reactor/state.go diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index 4ebeb0a9..8f1376ae 100644 --- a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -68,14 +68,14 @@ func ReceiveFeedback(item *models.Seed) error { return ErrReactorNotInitialized } + item.Source = models.SeedSourceFeedback + _, loaded := globalReactor.stateTable.Swap(item.UUID.String(), item) + if !loaded { + // An item sent to the feedback channel should be present on the state table, if not present reactor should error out + return ErrFeedbackItemNotPresent + } select { case globalReactor.input <- item: - item.Source = models.SeedSourceFeedback - _, loaded := globalReactor.stateTable.Swap(item.UUID, item) - if !loaded { - // An item sent to the feedback channel should be present on the state table, if not present reactor should error out - return ErrFeedbackItemNotPresent - } return nil case <-globalReactor.ctx.Done(): return ErrReactorShuttingDown @@ -90,8 +90,8 @@ func ReceiveSource(item *models.Seed) error { select { case globalReactor.tokenPool <- struct{}{}: + globalReactor.stateTable.Store(item.UUID.String(), item) globalReactor.input <- item - globalReactor.stateTable.Store(item.UUID, item) return nil case 
<-globalReactor.ctx.Done(): return ErrReactorShuttingDown @@ -104,7 +104,7 @@ func MarkAsFinished(item *models.Seed) error { return ErrReactorNotInitialized } - if _, loaded := globalReactor.stateTable.LoadAndDelete(item.UUID); loaded { + if _, loaded := globalReactor.stateTable.LoadAndDelete(item.UUID.String()); loaded { <-globalReactor.tokenPool return nil } diff --git a/internal/pkg/reactor/state.go b/internal/pkg/reactor/state.go new file mode 100644 index 00000000..c3e4e7fd --- /dev/null +++ b/internal/pkg/reactor/state.go @@ -0,0 +1,11 @@ +package reactor + +// GetStateTable returns a slice of all the seeds UUIDs as string in the state table. +func GetStateTable() []string { + keys := []string{} + globalReactor.stateTable.Range(func(key, _ interface{}) bool { + keys = append(keys, key.(string)) + return true + }) + return keys +} diff --git a/main.go b/main.go index c41448e4..58fcb882 100644 --- a/main.go +++ b/main.go @@ -13,7 +13,7 @@ import ( func main() { // Initialize the reactor with a maximum of 5 tokens outputChan := make(chan *models.Seed) - err := reactor.Start(5, outputChan) + err := reactor.Start(100, outputChan) if err != nil { fmt.Println("Error starting reactor:", err) return @@ -21,7 +21,7 @@ func main() { defer reactor.Stop() // Consume items from the output channel, start 5 goroutines - for i := 0; i < 5; i++ { + for i := 0; i < 100; i++ { go func() { for { select { @@ -32,7 +32,7 @@ func main() { if item.Source != models.SeedSourceFeedback { err := reactor.ReceiveFeedback(item) if err != nil { - fmt.Println("Error sending feedback:", err) + fmt.Println("Error sending feedback:", err, item.UUID.String()) } continue } @@ -52,21 +52,15 @@ func main() { } // Create mock seeds - uuid1 := uuid.New() - uuid2 := uuid.New() - mockSeeds := []*models.Seed{ - { - UUID: &uuid1, - URL: &gocrawlhq.URL{Value: "http://example.com/1"}, + mockSeeds := []*models.Seed{} + for i := 0; i <= 1000; i++ { + uuid := uuid.New() + mockSeeds = append(mockSeeds, 
&models.Seed{ + UUID: &uuid, + URL: &gocrawlhq.URL{Value: fmt.Sprintf("http://example.com/%d", i)}, Status: models.SeedFresh, - Source: models.SeedSourceQueue, - }, - { - UUID: &uuid2, - URL: &gocrawlhq.URL{Value: "http://example.com/2"}, - Status: models.SeedFresh, - Source: models.SeedSourceQueue, - }, + Source: models.SeedSourceHQ, + }) } // Queue mock seeds to the source channel @@ -78,5 +72,6 @@ func main() { } // Allow some time for processing - time.Sleep(10 * time.Second) + time.Sleep(5 * time.Second) + fmt.Println("State table:", reactor.GetStateTable()) } From c6a4657d1c328d1290bb823cf5960332656e7d85 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Mon, 18 Nov 2024 18:33:56 +0100 Subject: [PATCH 004/295] buffer the input channel to prevent deadlocks if there's less workers than tokens --- internal/pkg/reactor/reactor.go | 2 +- main.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index 8f1376ae..0ebd6e97 100644 --- a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -36,7 +36,7 @@ func Start(maxTokens int, outputChan chan *models.Seed) error { tokenPool: make(chan struct{}, maxTokens), ctx: ctx, cancelFunc: cancel, - input: make(chan *models.Seed), + input: make(chan *models.Seed, maxTokens), output: outputChan, } globalReactor.wg.Add(1) diff --git a/main.go b/main.go index 58fcb882..74d537f5 100644 --- a/main.go +++ b/main.go @@ -13,7 +13,7 @@ import ( func main() { // Initialize the reactor with a maximum of 5 tokens outputChan := make(chan *models.Seed) - err := reactor.Start(100, outputChan) + err := reactor.Start(10, outputChan) if err != nil { fmt.Println("Error starting reactor:", err) return @@ -21,7 +21,7 @@ func main() { defer reactor.Stop() // Consume items from the output channel, start 5 goroutines - for i := 0; i < 100; i++ { + for i := 0; i < 5; i++ { go func() { for { select { From 5a1b75135eda6aadc316b7f9255efbc458ddd6a9 
Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Mon, 18 Nov 2024 20:31:44 +0100 Subject: [PATCH 005/295] move test main to proper test file and empty main file --- internal/pkg/reactor/reactor_test.go | 76 ++++++++++++++++++++++++++++ main.go | 76 +--------------------------- 2 files changed, 77 insertions(+), 75 deletions(-) create mode 100644 internal/pkg/reactor/reactor_test.go diff --git a/internal/pkg/reactor/reactor_test.go b/internal/pkg/reactor/reactor_test.go new file mode 100644 index 00000000..694dc03d --- /dev/null +++ b/internal/pkg/reactor/reactor_test.go @@ -0,0 +1,76 @@ +package reactor + +import ( + "fmt" + "testing" + "time" + + "github.com/google/uuid" + "github.com/internetarchive/Zeno/pkg/models" + "github.com/internetarchive/gocrawlhq" +) + +func TestReactorE2E(t *testing.T) { + // Initialize the reactor with a maximum of 5 tokens + outputChan := make(chan *models.Seed) + err := Start(5, outputChan) + if err != nil { + t.Logf("Error starting reactor: %s", err) + return + } + defer Stop() + + // Consume items from the output channel, start 5 goroutines + for i := 0; i < 5; i++ { + go func(t *testing.T) { + for { + select { + case item := <-outputChan: + // Send feedback for the consumed item + if item.Source != models.SeedSourceFeedback { + err := ReceiveFeedback(item) + if err != nil { + t.Fatalf("Error sending feedback: %s - %s", err, item.UUID.String()) + } + continue + } + + // Mark the item as finished + if item.Source == models.SeedSourceFeedback { + err := MarkAsFinished(item) + if err != nil { + t.Fatalf("Error marking item as finished: %s", err) + } + continue + } + } + } + }(t) + } + + // Create mock seeds + mockSeeds := []*models.Seed{} + for i := 0; i <= 1000; i++ { + uuid := uuid.New() + mockSeeds = append(mockSeeds, &models.Seed{ + UUID: &uuid, + URL: &gocrawlhq.URL{Value: fmt.Sprintf("http://example.com/%d", i)}, + Status: models.SeedFresh, + Source: models.SeedSourceHQ, + }) + } + + // Queue mock seeds to the source channel 
+ for _, seed := range mockSeeds { + err := ReceiveSource(seed) + if err != nil { + t.Fatalf("Error queuing seed to source channel: %s", err) + } + } + + // Allow some time for processing + time.Sleep(5 * time.Second) + if len(GetStateTable()) > 0 { + t.Fatalf("State table is not empty: %s", GetStateTable()) + } +} diff --git a/main.go b/main.go index 74d537f5..38dd16da 100644 --- a/main.go +++ b/main.go @@ -1,77 +1,3 @@ package main -import ( - "fmt" - "time" - - "github.com/google/uuid" - "github.com/internetarchive/Zeno/internal/pkg/reactor" - "github.com/internetarchive/Zeno/pkg/models" - "github.com/internetarchive/gocrawlhq" -) - -func main() { - // Initialize the reactor with a maximum of 5 tokens - outputChan := make(chan *models.Seed) - err := reactor.Start(10, outputChan) - if err != nil { - fmt.Println("Error starting reactor:", err) - return - } - defer reactor.Stop() - - // Consume items from the output channel, start 5 goroutines - for i := 0; i < 5; i++ { - go func() { - for { - select { - case item := <-outputChan: - fmt.Println("Consumed item from output channel:", item.URL.Value, item.Source) - - // Send feedback for the consumed item - if item.Source != models.SeedSourceFeedback { - err := reactor.ReceiveFeedback(item) - if err != nil { - fmt.Println("Error sending feedback:", err, item.UUID.String()) - } - continue - } - - // Mark the item as finished - if item.Source == models.SeedSourceFeedback { - err := reactor.MarkAsFinished(item) - if err != nil { - fmt.Println("Error marking item as finished:", err) - } - fmt.Println("Marked item as finished:", item.URL.Value) - continue - } - } - } - }() - } - - // Create mock seeds - mockSeeds := []*models.Seed{} - for i := 0; i <= 1000; i++ { - uuid := uuid.New() - mockSeeds = append(mockSeeds, &models.Seed{ - UUID: &uuid, - URL: &gocrawlhq.URL{Value: fmt.Sprintf("http://example.com/%d", i)}, - Status: models.SeedFresh, - Source: models.SeedSourceHQ, - }) - } - - // Queue mock seeds to the source 
channel - for _, seed := range mockSeeds { - err := reactor.ReceiveSource(seed) - if err != nil { - fmt.Println("Error queuing seed to source channel:", err) - } - } - - // Allow some time for processing - time.Sleep(5 * time.Second) - fmt.Println("State table:", reactor.GetStateTable()) -} +func main() {} From 1531858006b5507a625812bb659dd6514ad1edaa Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Mon, 18 Nov 2024 20:51:48 +0100 Subject: [PATCH 006/295] re-import many utils & cmd --- .old/cmd/get_list.go | 61 ---- .old/cmd/get_url.go | 70 ----- .old/main.go | 23 -- {.old/cmd => cmd}/cmd.go | 2 +- {.old/cmd => cmd}/get.go | 2 - {.old/cmd => cmd}/get_hq.go | 23 +- go.mod | 38 ++- go.sum | 92 ++++++ {.old => internal/pkg}/config/config.go | 282 +++++++++++++++++- .../utils => internal/pkg/config}/version.go | 4 +- .../pkg/utils/atom_bool.go | 0 .../pkg/utils/atom_bool_test.go | 0 {.old/internal => internal}/pkg/utils/bool.go | 0 {.old/internal => internal}/pkg/utils/disk.go | 0 .../internal => internal}/pkg/utils/files.go | 0 .../pkg/utils/files_test.go | 0 {.old/internal => internal}/pkg/utils/ip.go | 7 +- .../pkg/utils/strings.go | 0 {.old/internal => internal}/pkg/utils/url.go | 0 .../pkg/utils/url_test.go | 0 internal/utils/ip.go | 37 +++ main.go | 22 +- .old/main_test.go => main_test.go | 0 23 files changed, 478 insertions(+), 185 deletions(-) delete mode 100644 .old/cmd/get_list.go delete mode 100644 .old/cmd/get_url.go delete mode 100644 .old/main.go rename {.old/cmd => cmd}/cmd.go (97%) rename {.old/cmd => cmd}/get.go (99%) rename {.old/cmd => cmd}/get_hq.go (75%) rename {.old => internal/pkg}/config/config.go (51%) rename {.old/internal/pkg/utils => internal/pkg/config}/version.go (94%) rename {.old/internal => internal}/pkg/utils/atom_bool.go (100%) rename {.old/internal => internal}/pkg/utils/atom_bool_test.go (100%) rename {.old/internal => internal}/pkg/utils/bool.go (100%) rename {.old/internal => internal}/pkg/utils/disk.go (100%) rename 
{.old/internal => internal}/pkg/utils/files.go (100%) rename {.old/internal => internal}/pkg/utils/files_test.go (100%) rename {.old/internal => internal}/pkg/utils/ip.go (77%) rename {.old/internal => internal}/pkg/utils/strings.go (100%) rename {.old/internal => internal}/pkg/utils/url.go (100%) rename {.old/internal => internal}/pkg/utils/url_test.go (100%) create mode 100644 internal/utils/ip.go rename .old/main_test.go => main_test.go (100%) diff --git a/.old/cmd/get_list.go b/.old/cmd/get_list.go deleted file mode 100644 index 1f935b7e..00000000 --- a/.old/cmd/get_list.go +++ /dev/null @@ -1,61 +0,0 @@ -package cmd - -import ( - "fmt" - - "github.com/internetarchive/Zeno/internal/pkg/crawl" - "github.com/internetarchive/Zeno/internal/pkg/queue" - "github.com/spf13/cobra" -) - -var getListCmd = &cobra.Command{ - Use: "list [FILE]", - Short: "Start crawling with a seed list", - Args: cobra.ExactArgs(1), - PreRunE: func(cmd *cobra.Command, args []string) error { - if cfg == nil { - return fmt.Errorf("viper config is nil") - } - return nil - }, - RunE: func(cmd *cobra.Command, args []string) error { - // Init crawl using the flags provided - crawl, err := crawl.GenerateCrawlConfig(cfg) - if err != nil { - if crawl != nil && crawl.Log != nil { - crawl.Log.WithFields(map[string]interface{}{ - "crawl": crawl, - "err": err.Error(), - }).Error("'get hq' exited due to error") - } - return err - } - - // Initialize initial seed list - crawl.SeedList, err = queue.FileToItems(args[0]) - if err != nil || len(crawl.SeedList) <= 0 { - crawl.Log.WithFields(map[string]interface{}{ - "input": args[0], - "err": err.Error(), - }).Error("This is not a valid input") - return err - } - - crawl.Log.WithFields(map[string]interface{}{ - "input": args[0], - "seedsCount": len(crawl.SeedList), - }).Info("Seed list loaded") - - // Start crawl - err = crawl.Start() - if err != nil { - crawl.Log.WithFields(map[string]interface{}{ - "crawl": crawl, - "err": err.Error(), - }).Error("Crawl 
exited due to error") - return err - } - - return nil - }, -} diff --git a/.old/cmd/get_url.go b/.old/cmd/get_url.go deleted file mode 100644 index 7212cd68..00000000 --- a/.old/cmd/get_url.go +++ /dev/null @@ -1,70 +0,0 @@ -package cmd - -import ( - "fmt" - "net/url" - - "github.com/internetarchive/Zeno/internal/pkg/crawl" - "github.com/internetarchive/Zeno/internal/pkg/queue" - "github.com/spf13/cobra" -) - -var getURLCmd = &cobra.Command{ - Use: "url [URL...]", - Short: "Archive given URLs", - Args: cobra.MinimumNArgs(1), - PreRunE: func(cmd *cobra.Command, args []string) error { - if cfg == nil { - return fmt.Errorf("viper config is nil") - } - return nil - }, - RunE: func(cmd *cobra.Command, args []string) error { - // Init crawl using the flags provided - crawl, err := crawl.GenerateCrawlConfig(cfg) - if err != nil { - if crawl != nil && crawl.Log != nil { - crawl.Log.WithFields(map[string]interface{}{ - "crawl": crawl, - "err": err.Error(), - }).Error("'get url' exited due to error") - } - return err - } - - // Initialize initial seed list - for _, arg := range args { - input, err := url.Parse(arg) - if err != nil { - crawl.Log.WithFields(map[string]interface{}{ - "input_url": arg, - "err": err.Error(), - }).Error("given URL is not a valid input") - return err - } - - item, err := queue.NewItem(input, nil, "seed", 0, "", false) - if err != nil { - crawl.Log.WithFields(map[string]interface{}{ - "input_url": arg, - "err": err.Error(), - }).Error("Failed to create new item") - return err - } - crawl.SeedList = append(crawl.SeedList, *item) - } - - // Start crawl - err = crawl.Start() - if err != nil { - crawl.Log.WithFields(map[string]interface{}{ - "crawl": crawl, - "err": err.Error(), - }).Error("'get url' Crawl() exited due to error") - return err - } - - crawl.Log.Info("Crawl finished") - return err - }, -} diff --git a/.old/main.go b/.old/main.go deleted file mode 100644 index b616c731..00000000 --- a/.old/main.go +++ /dev/null @@ -1,23 +0,0 @@ -// Zeno is 
a web crawler designed to operate wide crawls or to simply archive one web page. -// Zeno's key concepts are: portability, performance, simplicity ; with an emphasis on performance. - -// Authors: -// -// Corentin Barreau -// Jake LaFountain -// Thomas Foubert -package main - -import ( - "fmt" - "os" - - "github.com/internetarchive/Zeno/cmd" -) - -func main() { - if err := cmd.Run(); err != nil { - fmt.Println(err) - os.Exit(1) - } -} diff --git a/.old/cmd/cmd.go b/cmd/cmd.go similarity index 97% rename from .old/cmd/cmd.go rename to cmd/cmd.go index 62cd5526..c6002835 100644 --- a/.old/cmd/cmd.go +++ b/cmd/cmd.go @@ -3,7 +3,7 @@ package cmd import ( "fmt" - "github.com/internetarchive/Zeno/config" + "github.com/internetarchive/Zeno/internal/pkg/config" "github.com/spf13/cobra" ) diff --git a/.old/cmd/get.go b/cmd/get.go similarity index 99% rename from .old/cmd/get.go rename to cmd/get.go index fc273e2c..22a614a4 100644 --- a/.old/cmd/get.go +++ b/cmd/get.go @@ -18,9 +18,7 @@ func getCMDs() *cobra.Command { getCMDsFlags(getCmd) getHQCmdFlags(getHQCmd) - getCmd.AddCommand(getURLCmd) getCmd.AddCommand(getHQCmd) - getCmd.AddCommand(getListCmd) return getCmd } diff --git a/.old/cmd/get_hq.go b/cmd/get_hq.go similarity index 75% rename from .old/cmd/get_hq.go rename to cmd/get_hq.go index 3b3ad02d..065c7d9f 100644 --- a/.old/cmd/get_hq.go +++ b/cmd/get_hq.go @@ -3,7 +3,7 @@ package cmd import ( "fmt" - "github.com/internetarchive/Zeno/internal/pkg/crawl" + "github.com/internetarchive/Zeno/internal/pkg/config" "github.com/spf13/cobra" ) @@ -14,29 +14,14 @@ var getHQCmd = &cobra.Command{ if cfg == nil { return fmt.Errorf("viper config is nil") } + cfg.HQ = true + return nil }, RunE: func(cmd *cobra.Command, args []string) error { - // Init crawl using the flags provided - crawl, err := crawl.GenerateCrawlConfig(cfg) - if err != nil { - if crawl != nil && crawl.Log != nil { - crawl.Log.WithFields(map[string]interface{}{ - "crawl": crawl, - "err": err.Error(), - 
}).Error("'get hq' exited due to error") - } - return err - } - - // start crawl - err = crawl.Start() + _, err := config.GenerateCrawlConfig(cfg) if err != nil { - crawl.Log.WithFields(map[string]interface{}{ - "crawl": crawl, - "err": err.Error(), - }).Error("'get hq' Crawl() exited due to error") return err } diff --git a/go.mod b/go.mod index 4516a8ac..1328ec52 100644 --- a/go.mod +++ b/go.mod @@ -8,8 +8,44 @@ require ( ) require ( + github.com/CorentinB/warc v0.8.53 // indirect + github.com/andybalholm/brotli v1.1.0 // indirect + github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect + github.com/cloudflare/circl v1.4.0 // indirect + github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/ws v1.4.0 // indirect - golang.org/x/sys v0.26.0 // indirect + github.com/hashicorp/hcl v1.0.0 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/klauspost/compress v1.17.10 // indirect + github.com/magiconair/properties v1.8.7 // indirect + github.com/miekg/dns v1.1.62 // indirect + github.com/mitchellh/mapstructure v1.5.0 // indirect + github.com/paulbellamy/ratecounter v0.2.0 // indirect + github.com/pelletier/go-toml/v2 v2.2.2 // indirect + github.com/refraction-networking/utls v1.6.7 // indirect + github.com/sagikazarmark/locafero v0.4.0 // indirect + github.com/sagikazarmark/slog-shim v0.1.0 // indirect + github.com/sourcegraph/conc v0.3.0 // indirect + github.com/spf13/afero v1.11.0 // indirect + github.com/spf13/cast v1.6.0 // indirect + github.com/spf13/cobra v1.8.1 // indirect + github.com/spf13/pflag v1.0.5 // indirect + github.com/spf13/viper v1.19.0 // indirect + github.com/subosito/gotenv v1.6.0 // indirect + github.com/ulikunitz/xz v0.5.12 // indirect + go.uber.org/atomic v1.9.0 // indirect + go.uber.org/goleak v1.3.0 // indirect + go.uber.org/multierr v1.9.0 // indirect + golang.org/x/crypto v0.29.0 // 
indirect + golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect + golang.org/x/mod v0.21.0 // indirect + golang.org/x/net v0.31.0 // indirect + golang.org/x/sync v0.9.0 // indirect + golang.org/x/sys v0.27.0 // indirect + golang.org/x/text v0.20.0 // indirect + golang.org/x/tools v0.25.0 // indirect + gopkg.in/ini.v1 v1.67.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 71839d02..036c85fd 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,16 @@ +github.com/CorentinB/warc v0.8.53 h1:xVz3RMdZ6faAqTtLfcK1/yl8ZTansy+B2en//EZLUlM= +github.com/CorentinB/warc v0.8.53/go.mod h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8= +github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= +github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= +github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so= +github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw= +github.com/cloudflare/circl v1.4.0 h1:BV7h5MgrktNzytKmWjpOtdYrf0lkkbF8YMlBGPhJQrY= +github.com/cloudflare/circl v1.4.0/go.mod h1:PDRU+oXvdD7KCtgKxW95M5Z8BpSCJXQORiZFnBQS5QU= +github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= +github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU= github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= @@ -6,8 +19,87 @@ 
github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs= github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= +github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= +github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/internetarchive/gocrawlhq v1.2.20 h1:0mIIt9lhPacKr6L2JeISoopQ8EgzC3dISJ3ITGGbOp4= github.com/internetarchive/gocrawlhq v1.2.20/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek= +github.com/klauspost/compress v1.17.10 h1:oXAz+Vh0PMUvJczoi+flxpnBEPxoER1IaAnU/NMPtT0= +github.com/klauspost/compress v1.17.10/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= +github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= +github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= +github.com/miekg/dns v1.1.62 h1:cN8OuEF1/x5Rq6Np+h1epln8OiyPWV+lROx9LxcGgIQ= +github.com/miekg/dns v1.1.62/go.mod h1:mvDlcItzm+br7MToIKqkglaGhlFMHJ9DTNNWONWXbNQ= +github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= +github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/paulbellamy/ratecounter v0.2.0 h1:2L/RhJq+HA8gBQImDXtLPrDXK5qAj6ozWVK/zFXVJGs= +github.com/paulbellamy/ratecounter v0.2.0/go.mod h1:Hfx1hDpSGoqxkVVpBi/IlYD7kChlfo5C6hzIHwPqfFE= +github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM= +github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs= +github.com/pmezard/go-difflib 
v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/refraction-networking/utls v1.6.7 h1:zVJ7sP1dJx/WtVuITug3qYUq034cDq9B2MR1K67ULZM= +github.com/refraction-networking/utls v1.6.7/go.mod h1:BC3O4vQzye5hqpmDTWUqi4P5DDhzJfkV1tdqtawQIH0= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/sagikazarmark/locafero v0.4.0 h1:HApY1R9zGo4DBgr7dqsTH/JJxLTTsOt7u6keLGt6kNQ= +github.com/sagikazarmark/locafero v0.4.0/go.mod h1:Pe1W6UlPYUk/+wc/6KFhbORCfqzgYEpgQ3O5fPuL3H4= +github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE= +github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ= +github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= +github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0= +github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8= +github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY= +github.com/spf13/cast v1.6.0 h1:GEiTHELF+vaR5dhz3VqZfFSzZjYbgeKDpBxQVS4GYJ0= +github.com/spf13/cast v1.6.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= +github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= +github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/viper v1.19.0 h1:RWq5SEjt8o25SROyN3z2OrDB9l7RPd3lwTWU8EcEdcI= +github.com/spf13/viper v1.19.0/go.mod h1:GQUN9bilAbhU/jgc1bKs99f/suXKeUMct8Adx5+Ntkg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= 
+github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= +github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= +github.com/ulikunitz/xz v0.5.12 h1:37Nm15o69RwBkXM0J6A5OlE67RZTfzUxTj8fB3dfcsc= +github.com/ulikunitz/xz v0.5.12/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= +go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= +go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= +go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= +golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ= +golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg= +golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g= +golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= +golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0= +golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= +golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo= +golang.org/x/net v0.31.0/go.mod 
h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM= +golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ= +golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= +golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= +golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= +golang.org/x/tools v0.25.0 h1:oFU9pkj/iJgs+0DT+VMHrx+oBKs/LJMV+Uvg78sl+fE= +golang.org/x/tools v0.25.0/go.mod h1:/vtpO8WL1N9cQC3FN5zPqb//fRXskFHbLKk4OW1Q7rg= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= +gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/.old/config/config.go b/internal/pkg/config/config.go similarity index 51% rename from .old/config/config.go rename to internal/pkg/config/config.go index 916d8fc2..09f25b6b 100644 --- a/.old/config/config.go +++ b/internal/pkg/config/config.go @@ -2,17 +2,28 @@ package config import ( "fmt" + "log" + "log/slog" + "net/http" "net/url" "os" + "path" "path/filepath" "strings" "sync" + "time" + 
"github.com/CorentinB/warc" + "github.com/google/uuid" + "github.com/internetarchive/Zeno/internal/pkg/utils" + "github.com/internetarchive/gocrawlhq" + "github.com/paulbellamy/ratecounter" "github.com/spf13/pflag" "github.com/spf13/viper" ) -// Config holds all configuration for our program +// Config holds all configuration for our program, parsed from various sources +// The `mapstructure` tags are used to map the fields to the viper configuration type Config struct { LogLevel string `mapstructure:"log-level"` UserAgent string `mapstructure:"user-agent"` @@ -90,6 +101,116 @@ type Config struct { YTDLPPath string `mapstructure:"ytdlp-path"` } +// Crawl define the parameters of a crawl process +type Crawl struct { + *sync.Mutex + StartTime time.Time + // SeedList []queue.Item + // Paused *utils.TAtomBool + // Finished *utils.TAtomBool + LiveStats bool + + // Logger + Log *log.Logger + + // Queue (ex-frontier) + // Queue *queue.PersistentGroupedQueue + // Seencheck *seencheck.Seencheck + UseSeencheck bool + UseHandover bool + UseCommit bool + + // Worker pool + // Workers *WorkerPool + + // Crawl settings + MaxConcurrentAssets int + Client *warc.CustomHTTPClient + ClientProxied *warc.CustomHTTPClient + DisabledHTMLTags []string + ExcludedHosts []string + IncludedHosts []string + IncludedStrings []string + ExcludedStrings []string + UserAgent string + Job string + JobPath string + MaxHops uint8 + MaxRetry uint8 + MaxRedirect uint8 + HTTPTimeout int + MaxConcurrentRequestsPerDomain int + RateLimitDelay int + CrawlTimeLimit int + MaxCrawlTimeLimit int + DisableAssetsCapture bool + CaptureAlternatePages bool + DomainsCrawl bool + Headless bool + MinSpaceRequired int + + // Cookie-related settings + CookieFile string + KeepCookies bool + CookieJar http.CookieJar + + // Network settings + Proxy string + BypassProxy []string + RandomLocalIP bool + DisableIPv4 bool + DisableIPv6 bool + IPv6AnyIP bool + + // API settings + API bool + APIPort string + Prometheus bool + // 
PrometheusMetrics *PrometheusMetrics + + // Real time statistics + URIsPerSecond *ratecounter.RateCounter + ActiveWorkers *ratecounter.Counter + CrawledSeeds *ratecounter.Counter + CrawledAssets *ratecounter.Counter + + // WARC settings + WARCPrefix string + WARCOperator string + WARCWriter chan *warc.RecordBatch + WARCWriterFinish chan bool + WARCTempDir string + CDXDedupeServer string + WARCFullOnDisk bool + WARCPoolSize int + WARCDedupeSize int + WARCSize int + DisableLocalDedupe bool + CertValidation bool + WARCCustomCookie string + + // Crawl HQ settings + UseHQ bool + HQAddress string + HQProject string + HQKey string + HQSecret string + HQStrategy string + HQBatchConcurrency int + HQBatchSize int + HQContinuousPull bool + HQClient *gocrawlhq.Client + HQConsumerState string + // HQFinishedChannel chan *queue.Item + // HQProducerChannel chan *queue.Item + HQChannelsWg *sync.WaitGroup + HQRateLimitingSendBack bool + + // Dependencies + NoYTDLP bool + YTDLPPath string +} + var ( config *Config once sync.Once @@ -174,6 +295,165 @@ func GetConfig() *Config { return cfg } +func GenerateCrawlConfig(config *Config) (*Crawl, error) { + var c = new(Crawl) + + // Statistics counters + c.CrawledSeeds = new(ratecounter.Counter) + c.CrawledAssets = new(ratecounter.Counter) + c.ActiveWorkers = new(ratecounter.Counter) + c.URIsPerSecond = ratecounter.NewRateCounter(1 * time.Second) + + c.LiveStats = config.LiveStats + + // If the job name isn't specified, we generate a random name + if config.Job == "" { + if config.HQProject != "" { + c.Job = config.HQProject + } else { + UUID, err := uuid.NewUUID() + if err != nil { + slog.Error("cmd/utils.go:InitCrawlWithCMD():uuid.NewUUID()", "error", err) + return nil, err + } + + c.Job = UUID.String() + } + } else { + c.Job = config.Job + } + + c.JobPath = path.Join("jobs", config.Job) + + // TODO + // c.Workers = NewPool(uint(config.WorkersCount), time.Second*60, c) + + c.UseSeencheck = !config.DisableSeencheck + c.HTTPTimeout = 
config.HTTPTimeout + c.MaxConcurrentRequestsPerDomain = config.MaxConcurrentRequestsPerDomain + c.RateLimitDelay = config.ConcurrentSleepLength + c.CrawlTimeLimit = config.CrawlTimeLimit + + // Defaults --max-crawl-time-limit to 10% more than --crawl-time-limit + if config.CrawlMaxTimeLimit == 0 && config.CrawlTimeLimit != 0 { + c.MaxCrawlTimeLimit = config.CrawlTimeLimit + (config.CrawlTimeLimit / 10) + } else { + c.MaxCrawlTimeLimit = config.CrawlMaxTimeLimit + } + + c.MaxRetry = config.MaxRetry + c.MaxRedirect = config.MaxRedirect + c.MaxHops = config.MaxHops + c.DomainsCrawl = config.DomainsCrawl + c.DisableAssetsCapture = config.DisableAssetsCapture + c.DisabledHTMLTags = config.DisableHTMLTag + + // We exclude some hosts by default + c.ExcludedHosts = utils.DedupeStrings(append(config.ExcludeHosts, "archive.org", "archive-it.org")) + + c.IncludedHosts = config.IncludeHosts + c.CaptureAlternatePages = config.CaptureAlternatePages + c.ExcludedStrings = config.ExcludeString + c.IncludedStrings = config.IncludeString + + c.MinSpaceRequired = config.MinSpaceRequired + + // WARC settings + c.WARCPrefix = config.WARCPrefix + c.WARCOperator = config.WARCOperator + + if config.WARCTempDir != "" { + c.WARCTempDir = config.WARCTempDir + } else { + c.WARCTempDir = path.Join(c.JobPath, "temp") + } + + c.CDXDedupeServer = config.CDXDedupeServer + c.DisableLocalDedupe = config.DisableLocalDedupe + c.CertValidation = config.CertValidation + c.WARCFullOnDisk = config.WARCOnDisk + c.WARCPoolSize = config.WARCPoolSize + c.WARCDedupeSize = config.WARCDedupeSize + c.WARCCustomCookie = config.CDXCookie + c.WARCSize = config.WARCSize + + c.API = config.API + c.APIPort = config.APIPort + + // If Prometheus is specified, then we make sure + // c.API is true + c.Prometheus = config.Prometheus + if c.Prometheus { + c.API = true + // TODO: Implement Prometheus metrics + // c.PrometheusMetrics = &PrometheusMetrics{} + // c.PrometheusMetrics.Prefix = config.PrometheusPrefix + } + + // 
Dependencies + c.NoYTDLP = config.NoYTDLP + c.YTDLPPath = config.YTDLPPath + + if config.UserAgent != "Zeno" { + c.UserAgent = config.UserAgent + } else { + version := getVersion() + + // If Version is a commit hash, we only take the first 7 characters + if len(version.Version) >= 40 { + version.Version = version.Version[:7] + } + + c.UserAgent = "Mozilla/5.0 (compatible; archive.org_bot +http://archive.org/details/archive.org_bot) Zeno/" + version.Version + " warc/" + version.WarcVersion + slog.Info("User-Agent set to", "user-agent", c.UserAgent) + } + c.Headless = config.Headless + + c.CookieFile = config.Cookies + c.KeepCookies = config.KeepCookies + + // Network settings + c.Proxy = config.Proxy + c.BypassProxy = config.DomainsBypassProxy + c.RandomLocalIP = config.RandomLocalIP + + if c.RandomLocalIP { + slog.Warn("Random local IP is enabled") + } + + c.DisableIPv4 = config.DisableIPv4 + c.DisableIPv6 = config.DisableIPv6 + c.IPv6AnyIP = config.IPv6AnyIP + + if c.DisableIPv4 && c.DisableIPv6 { + slog.Error("Both IPv4 and IPv6 are disabled, at least one of them must be enabled.") + os.Exit(1) + } else if c.DisableIPv4 { + slog.Info("IPv4 is disabled") + } else if c.DisableIPv6 { + slog.Info("IPv6 is disabled") + } + + // Crawl HQ settings + c.UseHQ = config.HQ + c.HQProject = config.HQProject + c.HQAddress = config.HQAddress + c.HQKey = config.HQKey + c.HQSecret = config.HQSecret + c.HQStrategy = config.HQStrategy + c.HQBatchSize = int(config.HQBatchSize) + c.HQBatchConcurrency = config.HQBatchConcurrency + c.HQContinuousPull = config.HQContinuousPull + c.HQRateLimitingSendBack = config.HQRateLimitSendBack + + // Handover mechanism + c.UseHandover = config.Handover + + c.UseCommit = !config.NoBatchWriteWAL + + return c, nil +} + func handleFlagsEdgeCases() { if viper.GetBool("live-stats") { // If live-stats is true, set no-stdout-log to true diff --git a/.old/internal/pkg/utils/version.go b/internal/pkg/config/version.go similarity index 94% rename from 
.old/internal/pkg/utils/version.go rename to internal/pkg/config/version.go index 89ba3a84..8dd16d1c 100644 --- a/.old/internal/pkg/utils/version.go +++ b/internal/pkg/config/version.go @@ -1,4 +1,4 @@ -package utils +package config import ( "runtime/debug" @@ -11,7 +11,7 @@ type Version struct { ZenoVersion string } -func GetVersion() (version Version) { +func getVersion() (version Version) { // Defaults to "unknown_version" version.Version = "unknown_version" diff --git a/.old/internal/pkg/utils/atom_bool.go b/internal/pkg/utils/atom_bool.go similarity index 100% rename from .old/internal/pkg/utils/atom_bool.go rename to internal/pkg/utils/atom_bool.go diff --git a/.old/internal/pkg/utils/atom_bool_test.go b/internal/pkg/utils/atom_bool_test.go similarity index 100% rename from .old/internal/pkg/utils/atom_bool_test.go rename to internal/pkg/utils/atom_bool_test.go diff --git a/.old/internal/pkg/utils/bool.go b/internal/pkg/utils/bool.go similarity index 100% rename from .old/internal/pkg/utils/bool.go rename to internal/pkg/utils/bool.go diff --git a/.old/internal/pkg/utils/disk.go b/internal/pkg/utils/disk.go similarity index 100% rename from .old/internal/pkg/utils/disk.go rename to internal/pkg/utils/disk.go diff --git a/.old/internal/pkg/utils/files.go b/internal/pkg/utils/files.go similarity index 100% rename from .old/internal/pkg/utils/files.go rename to internal/pkg/utils/files.go diff --git a/.old/internal/pkg/utils/files_test.go b/internal/pkg/utils/files_test.go similarity index 100% rename from .old/internal/pkg/utils/files_test.go rename to internal/pkg/utils/files_test.go diff --git a/.old/internal/pkg/utils/ip.go b/internal/pkg/utils/ip.go similarity index 77% rename from .old/internal/pkg/utils/ip.go rename to internal/pkg/utils/ip.go index b101a1d2..0b135a33 100644 --- a/.old/internal/pkg/utils/ip.go +++ b/internal/pkg/utils/ip.go @@ -1,10 +1,9 @@ package utils import ( + "log/slog" "net" "os" - - "github.com/sirupsen/logrus" ) // Note: 
GetOutboundIP does not establish any connection and the @@ -18,7 +17,7 @@ func GetOutboundIP() net.IP { for { conn, err = net.Dial("udp", "24.24.24.24:24200") if err != nil { - logrus.Errorf("error getting outbound IP, retrying: %s", err) + slog.Error("error getting outbound IP, retrying", "err", err) continue } defer conn.Close() @@ -31,7 +30,7 @@ func GetOutboundIP() net.IP { func GetHostname() string { hostname, err := os.Hostname() if err != nil { - logrus.Errorf("error getting hostname: %s", err) + slog.Error("error getting hostname", "err", err) } return hostname diff --git a/.old/internal/pkg/utils/strings.go b/internal/pkg/utils/strings.go similarity index 100% rename from .old/internal/pkg/utils/strings.go rename to internal/pkg/utils/strings.go diff --git a/.old/internal/pkg/utils/url.go b/internal/pkg/utils/url.go similarity index 100% rename from .old/internal/pkg/utils/url.go rename to internal/pkg/utils/url.go diff --git a/.old/internal/pkg/utils/url_test.go b/internal/pkg/utils/url_test.go similarity index 100% rename from .old/internal/pkg/utils/url_test.go rename to internal/pkg/utils/url_test.go diff --git a/internal/utils/ip.go b/internal/utils/ip.go new file mode 100644 index 00000000..0b135a33 --- /dev/null +++ b/internal/utils/ip.go @@ -0,0 +1,37 @@ +package utils + +import ( + "log/slog" + "net" + "os" +) + +// Note: GetOutboundIP does not establish any connection and the +// destination does not need to exist for this function to work. 
+func GetOutboundIP() net.IP { + var ( + conn net.Conn + err error + ) + + for { + conn, err = net.Dial("udp", "24.24.24.24:24200") + if err != nil { + slog.Error("error getting outbound IP, retrying", "err", err) + continue + } + defer conn.Close() + break + } + + return conn.LocalAddr().(*net.UDPAddr).IP +} + +func GetHostname() string { + hostname, err := os.Hostname() + if err != nil { + slog.Error("error getting hostname", "err", err) + } + + return hostname +} diff --git a/main.go b/main.go index 38dd16da..b616c731 100644 --- a/main.go +++ b/main.go @@ -1,3 +1,23 @@ +// Zeno is a web crawler designed to operate wide crawls or to simply archive one web page. +// Zeno's key concepts are: portability, performance, simplicity ; with an emphasis on performance. + +// Authors: +// +// Corentin Barreau +// Jake LaFountain +// Thomas Foubert package main -func main() {} +import ( + "fmt" + "os" + + "github.com/internetarchive/Zeno/cmd" +) + +func main() { + if err := cmd.Run(); err != nil { + fmt.Println(err) + os.Exit(1) + } +} diff --git a/.old/main_test.go b/main_test.go similarity index 100% rename from .old/main_test.go rename to main_test.go From cc60547cf7bdbfbfabef201c451bc1abdfd38c5c Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Mon, 18 Nov 2024 21:18:19 +0100 Subject: [PATCH 007/295] re-architecture cmd --- cmd/get.go | 4 +- cmd/get_hq.go | 9 +- internal/pkg/config/config.go | 386 ++++++++-------------------------- main.go | 3 + 4 files changed, 95 insertions(+), 307 deletions(-) diff --git a/cmd/get.go b/cmd/get.go index 22a614a4..65fa8295 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -24,7 +24,7 @@ func getCMDs() *cobra.Command { } func getCMDsFlags(getCmd *cobra.Command) { - getCmd.PersistentFlags().String("user-agent", "Zeno", "User agent to use when requesting URLs.") + getCmd.PersistentFlags().String("user-agent", "", "User agent to use when requesting URLs.") getCmd.PersistentFlags().String("job", "", "Job name to use, will determine the path 
for the persistent queue, seencheck database, and WARC files.") getCmd.PersistentFlags().IntP("workers", "w", 1, "Number of concurrent workers to run.") getCmd.PersistentFlags().Int("max-concurrent-assets", 8, "Max number of concurrent assets to fetch PER worker. E.g. if you have 100 workers and this setting at 8, Zeno could do up to 800 concurrent requests at any time.") @@ -48,8 +48,6 @@ func getCMDsFlags(getCmd *cobra.Command) { getCmd.PersistentFlags().StringSlice("exclude-host", []string{}, "Exclude a specific host from the crawl, note that it will not exclude the domain if it is encountered as an asset for another web page.") getCmd.PersistentFlags().StringSlice("include-host", []string{}, "Only crawl specific hosts, note that it will not include the domain if it is encountered as an asset for another web page.") getCmd.PersistentFlags().StringSlice("include-string", []string{}, "Only crawl URLs containing this string.") - getCmd.PersistentFlags().Int("max-concurrent-per-domain", 16, "Maximum number of concurrent requests per domain.") - getCmd.PersistentFlags().Int("concurrent-sleep-length", 500, "Number of milliseconds to sleep when max concurrency per domain is reached.") getCmd.PersistentFlags().Int("crawl-time-limit", 0, "Number of seconds until the crawl will automatically set itself into the finished state.") getCmd.PersistentFlags().Int("crawl-max-time-limit", 0, "Number of seconds until the crawl will automatically panic itself. 
Default to crawl-time-limit + (crawl-time-limit / 10)") getCmd.PersistentFlags().StringSlice("exclude-string", []string{}, "Discard any (discovered) URLs containing this string.") diff --git a/cmd/get_hq.go b/cmd/get_hq.go index 065c7d9f..b913ee35 100644 --- a/cmd/get_hq.go +++ b/cmd/get_hq.go @@ -19,13 +19,8 @@ var getHQCmd = &cobra.Command{ return nil }, - RunE: func(cmd *cobra.Command, args []string) error { - _, err := config.GenerateCrawlConfig(cfg) - if err != nil { - return err - } - - return nil + RunE: func(cmd *cobra.Command, args []string) (err error) { + return config.GenerateCrawlConfig() }, } diff --git a/internal/pkg/config/config.go b/internal/pkg/config/config.go index 09f25b6b..8572a780 100644 --- a/internal/pkg/config/config.go +++ b/internal/pkg/config/config.go @@ -2,22 +2,16 @@ package config import ( "fmt" - "log" "log/slog" - "net/http" "net/url" "os" "path" "path/filepath" "strings" "sync" - "time" - "github.com/CorentinB/warc" "github.com/google/uuid" "github.com/internetarchive/Zeno/internal/pkg/utils" - "github.com/internetarchive/gocrawlhq" - "github.com/paulbellamy/ratecounter" "github.com/spf13/pflag" "github.com/spf13/viper" ) @@ -25,68 +19,73 @@ import ( // Config holds all configuration for our program, parsed from various sources // The `mapstructure` tags are used to map the fields to the viper configuration type Config struct { - LogLevel string `mapstructure:"log-level"` - UserAgent string `mapstructure:"user-agent"` - Job string `mapstructure:"job"` - Cookies string `mapstructure:"cookies"` - APIPort string `mapstructure:"api-port"` - PrometheusPrefix string `mapstructure:"prometheus-prefix"` - WARCPrefix string `mapstructure:"warc-prefix"` - WARCOperator string `mapstructure:"warc-operator"` - CDXDedupeServer string `mapstructure:"warc-cdx-dedupe-server"` - WARCTempDir string `mapstructure:"warc-temp-dir"` - WARCSize int `mapstructure:"warc-size"` - CDXCookie string `mapstructure:"cdx-cookie"` - HQAddress string 
`mapstructure:"hq-address"` - HQKey string `mapstructure:"hq-key"` - HQSecret string `mapstructure:"hq-secret"` - HQProject string `mapstructure:"hq-project"` - HQStrategy string `mapstructure:"hq-strategy"` - HQBatchSize int64 `mapstructure:"hq-batch-size"` - HQBatchConcurrency int `mapstructure:"hq-batch-concurrency"` - LogFileOutputDir string `mapstructure:"log-file-output-dir"` - ElasticSearchUsername string `mapstructure:"es-user"` - ElasticSearchPassword string `mapstructure:"es-password"` - ElasticSearchIndexPrefix string `mapstructure:"es-index-prefix"` - DisableHTMLTag []string `mapstructure:"disable-html-tag"` - ExcludeHosts []string `mapstructure:"exclude-host"` - IncludeHosts []string `mapstructure:"include-host"` - IncludeString []string `mapstructure:"include-string"` - ExcludeString []string `mapstructure:"exclude-string"` - ElasticSearchURLs []string `mapstructure:"es-url"` - WorkersCount int `mapstructure:"workers"` - MaxConcurrentAssets int `mapstructure:"max-concurrent-assets"` - MaxHops uint8 `mapstructure:"max-hops"` - MaxRedirect uint8 `mapstructure:"max-redirect"` - MaxRetry uint8 `mapstructure:"max-retry"` - HTTPTimeout int `mapstructure:"http-timeout"` - MaxConcurrentRequestsPerDomain int `mapstructure:"max-concurrent-per-domain"` - ConcurrentSleepLength int `mapstructure:"concurrent-sleep-length"` - CrawlTimeLimit int `mapstructure:"crawl-time-limit"` - CrawlMaxTimeLimit int `mapstructure:"crawl-max-time-limit"` - MinSpaceRequired int `mapstructure:"min-space-required"` - WARCPoolSize int `mapstructure:"warc-pool-size"` - WARCDedupeSize int `mapstructure:"warc-dedupe-size"` - KeepCookies bool `mapstructure:"keep-cookies"` - Headless bool `mapstructure:"headless"` - DisableSeencheck bool `mapstructure:"disable-seencheck"` - JSON bool `mapstructure:"json"` - Debug bool `mapstructure:"debug"` - LiveStats bool `mapstructure:"live-stats"` - API bool `mapstructure:"api"` - Prometheus bool `mapstructure:"prometheus"` - DomainsCrawl bool 
`mapstructure:"domains-crawl"` - CaptureAlternatePages bool `mapstructure:"capture-alternate-pages"` - WARCOnDisk bool `mapstructure:"warc-on-disk"` - DisableLocalDedupe bool `mapstructure:"disable-local-dedupe"` - CertValidation bool `mapstructure:"cert-validation"` - DisableAssetsCapture bool `mapstructure:"disable-assets-capture"` - HQ bool // Special field to check if HQ is enabled depending on the command called - HQContinuousPull bool `mapstructure:"hq-continuous-pull"` - HQRateLimitSendBack bool `mapstructure:"hq-rate-limiting-send-back"` - NoStdoutLogging bool `mapstructure:"no-stdout-log"` - NoBatchWriteWAL bool `mapstructure:"ultrasafe-queue"` - Handover bool `mapstructure:"handover"` + LogLevel string `mapstructure:"log-level"` + + Job string `mapstructure:"job"` + JobPath string + + // UseSeencheck exists just for convenience of not checking + // !DisableSeencheck in the rest of the code, to make the code clearer + DisableSeencheck bool `mapstructure:"disable-seencheck"` + UseSeencheck bool + + UserAgent string `mapstructure:"user-agent"` + Cookies string `mapstructure:"cookies"` + APIPort string `mapstructure:"api-port"` + PrometheusPrefix string `mapstructure:"prometheus-prefix"` + WARCPrefix string `mapstructure:"warc-prefix"` + WARCOperator string `mapstructure:"warc-operator"` + CDXDedupeServer string `mapstructure:"warc-cdx-dedupe-server"` + WARCTempDir string `mapstructure:"warc-temp-dir"` + WARCSize int `mapstructure:"warc-size"` + CDXCookie string `mapstructure:"cdx-cookie"` + HQAddress string `mapstructure:"hq-address"` + HQKey string `mapstructure:"hq-key"` + HQSecret string `mapstructure:"hq-secret"` + HQProject string `mapstructure:"hq-project"` + HQStrategy string `mapstructure:"hq-strategy"` + HQBatchSize int64 `mapstructure:"hq-batch-size"` + HQBatchConcurrency int `mapstructure:"hq-batch-concurrency"` + LogFileOutputDir string `mapstructure:"log-file-output-dir"` + ElasticSearchUsername string `mapstructure:"es-user"` + 
ElasticSearchPassword string `mapstructure:"es-password"` + ElasticSearchIndexPrefix string `mapstructure:"es-index-prefix"` + DisableHTMLTag []string `mapstructure:"disable-html-tag"` + ExcludeHosts []string `mapstructure:"exclude-host"` + IncludeHosts []string `mapstructure:"include-host"` + IncludeString []string `mapstructure:"include-string"` + ExcludeString []string `mapstructure:"exclude-string"` + ElasticSearchURLs []string `mapstructure:"es-url"` + WorkersCount int `mapstructure:"workers"` + MaxConcurrentAssets int `mapstructure:"max-concurrent-assets"` + MaxHops uint8 `mapstructure:"max-hops"` + MaxRedirect uint8 `mapstructure:"max-redirect"` + MaxRetry uint8 `mapstructure:"max-retry"` + HTTPTimeout int `mapstructure:"http-timeout"` + CrawlTimeLimit int `mapstructure:"crawl-time-limit"` + CrawlMaxTimeLimit int `mapstructure:"crawl-max-time-limit"` + MinSpaceRequired int `mapstructure:"min-space-required"` + WARCPoolSize int `mapstructure:"warc-pool-size"` + WARCDedupeSize int `mapstructure:"warc-dedupe-size"` + KeepCookies bool `mapstructure:"keep-cookies"` + Headless bool `mapstructure:"headless"` + JSON bool `mapstructure:"json"` + Debug bool `mapstructure:"debug"` + LiveStats bool `mapstructure:"live-stats"` + API bool `mapstructure:"api"` + Prometheus bool `mapstructure:"prometheus"` + DomainsCrawl bool `mapstructure:"domains-crawl"` + CaptureAlternatePages bool `mapstructure:"capture-alternate-pages"` + WARCOnDisk bool `mapstructure:"warc-on-disk"` + DisableLocalDedupe bool `mapstructure:"disable-local-dedupe"` + CertValidation bool `mapstructure:"cert-validation"` + DisableAssetsCapture bool `mapstructure:"disable-assets-capture"` + HQ bool // Special field to check if HQ is enabled depending on the command called + HQContinuousPull bool `mapstructure:"hq-continuous-pull"` + HQRateLimitSendBack bool `mapstructure:"hq-rate-limiting-send-back"` + NoStdoutLogging bool `mapstructure:"no-stdout-log"` + NoBatchWriteWAL bool 
`mapstructure:"ultrasafe-queue"` + Handover bool `mapstructure:"handover"` // Network Proxy string `mapstructure:"proxy"` @@ -101,116 +100,6 @@ type Config struct { YTDLPPath string `mapstructure:"ytdlp-path"` } -// Crawl define the parameters of a crawl process -type Crawl struct { - *sync.Mutex - StartTime time.Time - // SeedList []queue.Item - // Paused *utils.TAtomBool - // Finished *utils.TAtomBool - LiveStats bool - - // Logger - Log *log.Logger - - // Queue (ex-frontier) - // Queue *queue.PersistentGroupedQueue - // Seencheck *seencheck.Seencheck - UseSeencheck bool - UseHandover bool - UseCommit bool - - // Worker pool - // Workers *WorkerPool - - // Crawl settings - MaxConcurrentAssets int - Client *warc.CustomHTTPClient - ClientProxied *warc.CustomHTTPClient - DisabledHTMLTags []string - ExcludedHosts []string - IncludedHosts []string - IncludedStrings []string - ExcludedStrings []string - UserAgent string - Job string - JobPath string - MaxHops uint8 - MaxRetry uint8 - MaxRedirect uint8 - HTTPTimeout int - MaxConcurrentRequestsPerDomain int - RateLimitDelay int - CrawlTimeLimit int - MaxCrawlTimeLimit int - DisableAssetsCapture bool - CaptureAlternatePages bool - DomainsCrawl bool - Headless bool - MinSpaceRequired int - - // Cookie-related settings - CookieFile string - KeepCookies bool - CookieJar http.CookieJar - - // Network settings - Proxy string - BypassProxy []string - RandomLocalIP bool - DisableIPv4 bool - DisableIPv6 bool - IPv6AnyIP bool - - // API settings - API bool - APIPort string - Prometheus bool - // PrometheusMetrics *PrometheusMetrics - - // Real time statistics - URIsPerSecond *ratecounter.RateCounter - ActiveWorkers *ratecounter.Counter - CrawledSeeds *ratecounter.Counter - CrawledAssets *ratecounter.Counter - - // WARC settings - WARCPrefix string - WARCOperator string - WARCWriter chan *warc.RecordBatch - WARCWriterFinish chan bool - WARCTempDir string - CDXDedupeServer string - WARCFullOnDisk bool - WARCPoolSize int - 
WARCDedupeSize int - WARCSize int - DisableLocalDedupe bool - CertValidation bool - WARCCustomCookie string - - // Crawl HQ settings - UseHQ bool - HQAddress string - HQProject string - HQKey string - HQSecret string - HQStrategy string - HQBatchConcurrency int - HQBatchSize int - HQContinuousPull bool - HQClient *gocrawlhq.Client - HQConsumerState string - // HQFinishedChannel chan *queue.Item - // HQProducerChannel chan *queue.Item - HQChannelsWg *sync.WaitGroup - HQRateLimitingSendBack bool - - // Dependencies - NoYTDLP bool - YTDLPPath string -} - var ( config *Config once sync.Once @@ -288,115 +177,49 @@ func BindFlags(flagSet *pflag.FlagSet) { // GetConfig returns the config struct func GetConfig() *Config { - cfg := config - if cfg == nil { - panic("Config not initialized. Call InitConfig() before accessing the config.") - } - return cfg + return config } -func GenerateCrawlConfig(config *Config) (*Crawl, error) { - var c = new(Crawl) - - // Statistics counters - c.CrawledSeeds = new(ratecounter.Counter) - c.CrawledAssets = new(ratecounter.Counter) - c.ActiveWorkers = new(ratecounter.Counter) - c.URIsPerSecond = ratecounter.NewRateCounter(1 * time.Second) - - c.LiveStats = config.LiveStats +func GenerateCrawlConfig() error { + config.LiveStats = config.LiveStats // If the job name isn't specified, we generate a random name if config.Job == "" { if config.HQProject != "" { - c.Job = config.HQProject + config.Job = config.HQProject } else { UUID, err := uuid.NewUUID() if err != nil { slog.Error("cmd/utils.go:InitCrawlWithCMD():uuid.NewUUID()", "error", err) - return nil, err + return err } - c.Job = UUID.String() + config.Job = UUID.String() } } else { - c.Job = config.Job + config.Job = config.Job } - c.JobPath = path.Join("jobs", config.Job) + config.JobPath = path.Join("jobs", config.Job) // TODO - // c.Workers = NewPool(uint(config.WorkersCount), time.Second*60, c) + // Crawl.Workers = NewPool(uint(config.WorkersCount), time.Second*60, c) - c.UseSeencheck 
= !config.DisableSeencheck - c.HTTPTimeout = config.HTTPTimeout - c.MaxConcurrentRequestsPerDomain = config.MaxConcurrentRequestsPerDomain - c.RateLimitDelay = config.ConcurrentSleepLength - c.CrawlTimeLimit = config.CrawlTimeLimit + config.UseSeencheck = !config.DisableSeencheck // Defaults --max-crawl-time-limit to 10% more than --crawl-time-limit if config.CrawlMaxTimeLimit == 0 && config.CrawlTimeLimit != 0 { - c.MaxCrawlTimeLimit = config.CrawlTimeLimit + (config.CrawlTimeLimit / 10) - } else { - c.MaxCrawlTimeLimit = config.CrawlMaxTimeLimit + config.CrawlMaxTimeLimit = config.CrawlTimeLimit + (config.CrawlTimeLimit / 10) } - c.MaxRetry = config.MaxRetry - c.MaxRedirect = config.MaxRedirect - c.MaxHops = config.MaxHops - c.DomainsCrawl = config.DomainsCrawl - c.DisableAssetsCapture = config.DisableAssetsCapture - c.DisabledHTMLTags = config.DisableHTMLTag - // We exclude some hosts by default - c.ExcludedHosts = utils.DedupeStrings(append(config.ExcludeHosts, "archive.org", "archive-it.org")) - - c.IncludedHosts = config.IncludeHosts - c.CaptureAlternatePages = config.CaptureAlternatePages - c.ExcludedStrings = config.ExcludeString - c.IncludedStrings = config.IncludeString + config.ExcludeHosts = utils.DedupeStrings(append(config.ExcludeHosts, "archive.org", "archive-it.org")) - c.MinSpaceRequired = config.MinSpaceRequired - - // WARC settings - c.WARCPrefix = config.WARCPrefix - c.WARCOperator = config.WARCOperator - - if config.WARCTempDir != "" { - c.WARCTempDir = config.WARCTempDir - } else { - c.WARCTempDir = path.Join(c.JobPath, "temp") + if config.WARCTempDir == "" { + config.WARCTempDir = path.Join(config.JobPath, "temp") } - c.CDXDedupeServer = config.CDXDedupeServer - c.DisableLocalDedupe = config.DisableLocalDedupe - c.CertValidation = config.CertValidation - c.WARCFullOnDisk = config.WARCOnDisk - c.WARCPoolSize = config.WARCPoolSize - c.WARCDedupeSize = config.WARCDedupeSize - c.WARCCustomCookie = config.CDXCookie - c.WARCSize = config.WARCSize - 
- c.API = config.API - c.APIPort = config.APIPort - - // If Prometheus is specified, then we make sure - // c.API is true - c.Prometheus = config.Prometheus - if c.Prometheus { - c.API = true - // TODO: Implement Prometheus metrics - // c.PrometheusMetrics = &PrometheusMetrics{} - // c.PrometheusMetrics.Prefix = config.PrometheusPrefix - } - - // Dependencies - c.NoYTDLP = config.NoYTDLP - c.YTDLPPath = config.YTDLPPath - - if config.UserAgent != "Zeno" { - c.UserAgent = config.UserAgent - } else { + if config.UserAgent == "" { version := getVersion() // If Version is a commit hash, we only take the first 7 characters @@ -404,54 +227,24 @@ func GenerateCrawlConfig(config *Config) (*Crawl, error) { version.Version = version.Version[:7] } - c.UserAgent = "Mozilla/5.0 (compatible; archive.org_bot +http://archive.org/details/archive.org_bot) Zeno/" + version.Version + " warc/" + version.WarcVersion - slog.Info("User-Agent set to", "user-agent", c.UserAgent) + config.UserAgent = "Mozilla/5.0 (compatible; archive.org_bot +http://archive.org/details/archive.org_bot) Zeno/" + version.Version + " warc/" + version.WarcVersion + slog.Info("User-Agent set to", "user-agent", config.UserAgent) } - c.Headless = config.Headless - - c.CookieFile = config.Cookies - c.KeepCookies = config.KeepCookies - // Network settings - c.Proxy = config.Proxy - c.BypassProxy = config.DomainsBypassProxy - c.RandomLocalIP = config.RandomLocalIP - - if c.RandomLocalIP { + if config.RandomLocalIP { slog.Warn("Random local IP is enabled") } - c.DisableIPv4 = config.DisableIPv4 - c.DisableIPv6 = config.DisableIPv6 - c.IPv6AnyIP = config.IPv6AnyIP - - if c.DisableIPv4 && c.DisableIPv6 { + if config.DisableIPv4 && config.DisableIPv6 { slog.Error("Both IPv4 and IPv6 are disabled, at least one of them must be enabled.") os.Exit(1) - } else if c.DisableIPv4 { + } else if config.DisableIPv4 { slog.Info("IPv4 is disabled") - } else if c.DisableIPv6 { + } else if config.DisableIPv6 { slog.Info("IPv6 is 
disabled") } - // Crawl HQ settings - c.UseHQ = config.HQ - c.HQProject = config.HQProject - c.HQAddress = config.HQAddress - c.HQKey = config.HQKey - c.HQSecret = config.HQSecret - c.HQStrategy = config.HQStrategy - c.HQBatchSize = int(config.HQBatchSize) - c.HQBatchConcurrency = config.HQBatchConcurrency - c.HQContinuousPull = config.HQContinuousPull - c.HQRateLimitingSendBack = config.HQRateLimitSendBack - - // Handover mechanism - c.UseHandover = config.Handover - - c.UseCommit = !config.NoBatchWriteWAL - - return c, nil + return nil } func handleFlagsEdgeCases() { @@ -469,7 +262,6 @@ func handleFlagsEdgeCases() { func handleFlagsAliases() { // For each flag we want to alias, we check if the original flag is at default and if the alias is not // If so, we set the original flag to the value of the alias - if viper.GetUint("hops") != 0 && viper.GetUint("max-hops") == 0 { viper.Set("max-hops", viper.GetUint("hops")) } diff --git a/main.go b/main.go index b616c731..2e86f043 100644 --- a/main.go +++ b/main.go @@ -10,6 +10,7 @@ package main import ( "fmt" + "log/slog" "os" "github.com/internetarchive/Zeno/cmd" @@ -20,4 +21,6 @@ func main() { fmt.Println(err) os.Exit(1) } + + slog.Info("and here we COOK") } From 79585241d24683be7a5f9220ebd7ef53ea04f2f6 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Mon, 18 Nov 2024 21:19:24 +0100 Subject: [PATCH 008/295] remove: useless utils directory --- internal/utils/ip.go | 37 ------------------------------------- 1 file changed, 37 deletions(-) delete mode 100644 internal/utils/ip.go diff --git a/internal/utils/ip.go b/internal/utils/ip.go deleted file mode 100644 index 0b135a33..00000000 --- a/internal/utils/ip.go +++ /dev/null @@ -1,37 +0,0 @@ -package utils - -import ( - "log/slog" - "net" - "os" -) - -// Note: GetOutboundIP does not establish any connection and the -// destination does not need to exist for this function to work. 
-func GetOutboundIP() net.IP { - var ( - conn net.Conn - err error - ) - - for { - conn, err = net.Dial("udp", "24.24.24.24:24200") - if err != nil { - slog.Error("error getting outbound IP, retrying", "err", err) - continue - } - defer conn.Close() - break - } - - return conn.LocalAddr().(*net.UDPAddr).IP -} - -func GetHostname() string { - hostname, err := os.Hostname() - if err != nil { - slog.Error("error getting hostname", "err", err) - } - - return hostname -} From b6d16ae28eb999ebf333912e0755be89b440e008 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 11:10:16 +0100 Subject: [PATCH 009/295] update go.mod and reactor: make inserted default to a default inserted type and stats: add draft of package --- go.mod | 25 +++----- go.sum | 45 ++++++-------- internal/pkg/reactor/reactor.go | 9 ++- internal/pkg/reactor/reactor_test.go | 7 ++- internal/pkg/stats/error.go | 10 ++++ internal/pkg/stats/rate.go | 36 +++++++++++ internal/pkg/stats/rate_test.go | 90 ++++++++++++++++++++++++++++ internal/pkg/stats/stats.go | 29 +++++++++ pkg/models/seed.go | 4 +- 9 files changed, 204 insertions(+), 51 deletions(-) create mode 100644 internal/pkg/stats/error.go create mode 100644 internal/pkg/stats/rate.go create mode 100644 internal/pkg/stats/rate_test.go create mode 100644 internal/pkg/stats/stats.go diff --git a/go.mod b/go.mod index 1328ec52..4cf1333c 100644 --- a/go.mod +++ b/go.mod @@ -3,49 +3,38 @@ module github.com/internetarchive/Zeno go 1.23.3 require ( + github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 github.com/google/uuid v1.6.0 github.com/internetarchive/gocrawlhq v1.2.20 + github.com/spf13/cobra v1.8.1 + github.com/spf13/pflag v1.0.5 + github.com/spf13/viper v1.19.0 + go.uber.org/goleak v1.3.0 + golang.org/x/net v0.31.0 ) require ( - github.com/CorentinB/warc v0.8.53 // indirect - github.com/andybalholm/brotli v1.1.0 // indirect - github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect - 
github.com/cloudflare/circl v1.4.0 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/ws v1.4.0 // indirect + github.com/google/go-cmp v0.6.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/klauspost/compress v1.17.10 // indirect github.com/magiconair/properties v1.8.7 // indirect - github.com/miekg/dns v1.1.62 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect - github.com/paulbellamy/ratecounter v0.2.0 // indirect github.com/pelletier/go-toml/v2 v2.2.2 // indirect - github.com/refraction-networking/utls v1.6.7 // indirect github.com/sagikazarmark/locafero v0.4.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect github.com/sourcegraph/conc v0.3.0 // indirect github.com/spf13/afero v1.11.0 // indirect github.com/spf13/cast v1.6.0 // indirect - github.com/spf13/cobra v1.8.1 // indirect - github.com/spf13/pflag v1.0.5 // indirect - github.com/spf13/viper v1.19.0 // indirect github.com/subosito/gotenv v1.6.0 // indirect - github.com/ulikunitz/xz v0.5.12 // indirect go.uber.org/atomic v1.9.0 // indirect - go.uber.org/goleak v1.3.0 // indirect go.uber.org/multierr v1.9.0 // indirect - golang.org/x/crypto v0.29.0 // indirect golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect - golang.org/x/mod v0.21.0 // indirect - golang.org/x/net v0.31.0 // indirect - golang.org/x/sync v0.9.0 // indirect golang.org/x/sys v0.27.0 // indirect golang.org/x/text v0.20.0 // indirect - golang.org/x/tools v0.25.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 036c85fd..3cf46f98 100644 --- a/go.sum +++ b/go.sum @@ -1,14 +1,12 @@ -github.com/CorentinB/warc v0.8.53 h1:xVz3RMdZ6faAqTtLfcK1/yl8ZTansy+B2en//EZLUlM= -github.com/CorentinB/warc v0.8.53/go.mod 
h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8= -github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= -github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so= github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw= -github.com/cloudflare/circl v1.4.0 h1:BV7h5MgrktNzytKmWjpOtdYrf0lkkbF8YMlBGPhJQrY= -github.com/cloudflare/circl v1.4.0/go.mod h1:PDRU+oXvdD7KCtgKxW95M5Z8BpSCJXQORiZFnBQS5QU= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= +github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU= @@ -17,6 +15,8 @@ github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs= github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc= +github.com/google/go-cmp v0.6.0 
h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= @@ -25,21 +25,21 @@ github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/internetarchive/gocrawlhq v1.2.20 h1:0mIIt9lhPacKr6L2JeISoopQ8EgzC3dISJ3ITGGbOp4= github.com/internetarchive/gocrawlhq v1.2.20/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek= -github.com/klauspost/compress v1.17.10 h1:oXAz+Vh0PMUvJczoi+flxpnBEPxoER1IaAnU/NMPtT0= -github.com/klauspost/compress v1.17.10/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= -github.com/miekg/dns v1.1.62 h1:cN8OuEF1/x5Rq6Np+h1epln8OiyPWV+lROx9LxcGgIQ= -github.com/miekg/dns v1.1.62/go.mod h1:mvDlcItzm+br7MToIKqkglaGhlFMHJ9DTNNWONWXbNQ= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= -github.com/paulbellamy/ratecounter v0.2.0 h1:2L/RhJq+HA8gBQImDXtLPrDXK5qAj6ozWVK/zFXVJGs= -github.com/paulbellamy/ratecounter v0.2.0/go.mod h1:Hfx1hDpSGoqxkVVpBi/IlYD7kChlfo5C6hzIHwPqfFE= 
github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM= github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/refraction-networking/utls v1.6.7 h1:zVJ7sP1dJx/WtVuITug3qYUq034cDq9B2MR1K67ULZM= -github.com/refraction-networking/utls v1.6.7/go.mod h1:BC3O4vQzye5hqpmDTWUqi4P5DDhzJfkV1tdqtawQIH0= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sagikazarmark/locafero v0.4.0 h1:HApY1R9zGo4DBgr7dqsTH/JJxLTTsOt7u6keLGt6kNQ= github.com/sagikazarmark/locafero v0.4.0/go.mod h1:Pe1W6UlPYUk/+wc/6KFhbORCfqzgYEpgQ3O5fPuL3H4= @@ -65,39 +65,28 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= -github.com/ulikunitz/xz v0.5.12 h1:37Nm15o69RwBkXM0J6A5OlE67RZTfzUxTj8fB3dfcsc= -github.com/ulikunitz/xz 
v0.5.12/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= -golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ= -golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg= golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g= golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= -golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0= -golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo= golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM= -golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ= -golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= -golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= golang.org/x/text 
v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= -golang.org/x/tools v0.25.0 h1:oFU9pkj/iJgs+0DT+VMHrx+oBKs/LJMV+Uvg78sl+fE= -golang.org/x/tools v0.25.0/go.mod h1:/vtpO8WL1N9cQC3FN5zPqb//fRXskFHbLKk4OW1Q7rg= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index 0ebd6e97..5ddaf5d9 100644 --- a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -63,6 +63,7 @@ func Stop() { } // ReceiveFeedback sends an item to the feedback channel. +// If the item is not present on the state table it gets discarded func ReceiveFeedback(item *models.Seed) error { if globalReactor == nil { return ErrReactorNotInitialized @@ -82,14 +83,18 @@ func ReceiveFeedback(item *models.Seed) error { } } -// ReceiveSource sends an item to the source seeds channel. -func ReceiveSource(item *models.Seed) error { +// ReceiveInsert sends an item to the input channel consuming a token. 
+// It is the responsibility of the sender to set either SeedSourceQueue or SeedSourceHQ, if not set seed will get forced SeedSourceInsert +func ReceiveInsert(item *models.Seed) error { if globalReactor == nil { return ErrReactorNotInitialized } select { case globalReactor.tokenPool <- struct{}{}: + if item.Source != models.SeedSourceQueue && item.Source != models.SeedSourceHQ { + item.Source = models.SeedSourceInsert + } globalReactor.stateTable.Store(item.UUID.String(), item) globalReactor.input <- item return nil diff --git a/internal/pkg/reactor/reactor_test.go b/internal/pkg/reactor/reactor_test.go index 694dc03d..79b2cefa 100644 --- a/internal/pkg/reactor/reactor_test.go +++ b/internal/pkg/reactor/reactor_test.go @@ -13,7 +13,7 @@ import ( func TestReactorE2E(t *testing.T) { // Initialize the reactor with a maximum of 5 tokens outputChan := make(chan *models.Seed) - err := Start(5, outputChan) + err := Start(1, outputChan) if err != nil { t.Logf("Error starting reactor: %s", err) return @@ -26,6 +26,9 @@ func TestReactorE2E(t *testing.T) { for { select { case item := <-outputChan: + if item == nil { + continue + } // Send feedback for the consumed item if item.Source != models.SeedSourceFeedback { err := ReceiveFeedback(item) @@ -62,7 +65,7 @@ func TestReactorE2E(t *testing.T) { // Queue mock seeds to the source channel for _, seed := range mockSeeds { - err := ReceiveSource(seed) + err := ReceiveInsert(seed) if err != nil { t.Fatalf("Error queuing seed to source channel: %s", err) } diff --git a/internal/pkg/stats/error.go b/internal/pkg/stats/error.go new file mode 100644 index 00000000..3df886ec --- /dev/null +++ b/internal/pkg/stats/error.go @@ -0,0 +1,10 @@ +package stats + +import "errors" + +var ( + // ErrStatsNotInitialized is returned when the stats package is not initialized + ErrStatsNotInitialized = errors.New("stats not initialized") + // ErrStatsAlreadyInitialized is returned when the stats package is already initialized + 
ErrStatsAlreadyInitialized = errors.New("stats already initialized") +) diff --git a/internal/pkg/stats/rate.go b/internal/pkg/stats/rate.go new file mode 100644 index 00000000..4ba24cc7 --- /dev/null +++ b/internal/pkg/stats/rate.go @@ -0,0 +1,36 @@ +package stats + +import ( + "sync/atomic" + "time" +) + +type rate struct { + count uint64 + lastCount uint64 + lastUpdate int64 +} + +func (rps *rate) incr(step uint64) { + atomic.AddUint64(&rps.count, step) +} + +func (rps *rate) get() uint64 { + now := time.Now().Unix() + lastUpdate := atomic.LoadInt64(&rps.lastUpdate) + + if now == lastUpdate { + return atomic.LoadUint64(&rps.lastCount) + } + + currentCount := atomic.LoadUint64(&rps.count) + lastCount := atomic.SwapUint64(&rps.count, 0) + atomic.StoreUint64(&rps.lastCount, lastCount) + atomic.StoreInt64(&rps.lastUpdate, now) + + return currentCount +} + +func (rps *rate) getTotal() uint64 { + return atomic.LoadUint64(&rps.count) +} diff --git a/internal/pkg/stats/rate_test.go b/internal/pkg/stats/rate_test.go new file mode 100644 index 00000000..e5424e7c --- /dev/null +++ b/internal/pkg/stats/rate_test.go @@ -0,0 +1,90 @@ +package stats + +import ( + "sync/atomic" + "testing" + "time" +) + +func TestRate_Start(t *testing.T) { + rate := &rate{} + + // Increment the rate counter + rate.incr(5) + + // Wait for more than a second to allow the ticker to update the rate + time.Sleep(1100 * time.Millisecond) + + // Check if the rate per second is correctly updated + if rate.get() != 5 { + t.Errorf("expected rate per second to be 5, got %d", rate.get()) + } + + // Increment the rate counter again + rate.incr(10) + + // Wait for more than a second to allow the ticker to update the rate + time.Sleep(1100 * time.Millisecond) + + // Check if the rate per second is correctly updated + if rate.get() != 10 { + t.Errorf("expected rate per second to be 10, got %d", rate.get()) + } + + // Increment the rate counter multiple times and check the rate over several seconds + for i := 
0; i < 5; i++ { + rate.incr(2) + time.Sleep(1100 * time.Millisecond) + expectedRate := uint64(2) + if rate.get() != expectedRate { + t.Errorf("expected rate per second to be %d, got %d", expectedRate, rate.get()) + } + } +} + +func TestRate_Incr(t *testing.T) { + rate := &rate{} + + // Increment the rate counter + rate.incr(3) + + // Check if the count is correctly incremented + if atomic.LoadUint64(&rate.count) != 3 { + t.Errorf("expected count to be 3, got %d", atomic.LoadUint64(&rate.count)) + } + + // Increment the rate counter again + rate.incr(2) + + // Check if the count is correctly incremented + if atomic.LoadUint64(&rate.count) != 5 { + t.Errorf("expected count to be 5, got %d", atomic.LoadUint64(&rate.count)) + } +} + +func TestRate_Get(t *testing.T) { + rate := &rate{} + + // Increment the rate counter + rate.incr(7) + + // Wait for more than a second to allow the ticker to update the rate + time.Sleep(1100 * time.Millisecond) + + // Check if the rate per second is correctly updated + if rate.get() != 7 { + t.Errorf("expected rate per second to be 7, got %d", rate.get()) + } +} + +func TestRate_GetTotal(t *testing.T) { + rate := &rate{} + + // Increment the rate counter + rate.incr(7) + + // Check if the total count is correctly retrieved + if rate.getTotal() != 7 { + t.Errorf("expected total count to be 7, got %d", rate.getTotal()) + } +} diff --git a/internal/pkg/stats/stats.go b/internal/pkg/stats/stats.go new file mode 100644 index 00000000..43f0f34a --- /dev/null +++ b/internal/pkg/stats/stats.go @@ -0,0 +1,29 @@ +package stats + +import "sync" + +type stats struct { + URLsCrawled *rate + SeedsFinished *rate +} + +var ( + globalStats *stats + doOnce sync.Once +) + +func Init() error { + var done = false + doOnce.Do(func() { + globalStats = &stats{ + URLsCrawled: &rate{}, + SeedsFinished: &rate{}, + } + done = true + }) + + if !done { + return ErrStatsAlreadyInitialized + } + return nil +} diff --git a/pkg/models/seed.go b/pkg/models/seed.go index 
7c412d0b..3d7ff260 100644 --- a/pkg/models/seed.go +++ b/pkg/models/seed.go @@ -37,8 +37,10 @@ const ( type SeedSource int const ( + // SeedSourceInsert is for seeds which source is not defined when inserted on reactor + SeedSourceInsert SeedSource = iota // SeedSourceQueue is for seeds that are from the Queue - SeedSourceQueue SeedSource = iota + SeedSourceQueue // SeedSourceHQ is for seeds that are from the HQ SeedSourceHQ // SeedSourceFeedback is for seeds that are from the Feedback From fd9fcf627ec4bc0c7bb9e6ba0ac1cbba9249795f Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 11:39:04 +0100 Subject: [PATCH 010/295] reactor: package documentation --- internal/pkg/reactor/README.md | 120 ++++++++++++++++++++++++++++++++ internal/pkg/reactor/reactor.go | 14 ++-- 2 files changed, 127 insertions(+), 7 deletions(-) create mode 100644 internal/pkg/reactor/README.md diff --git a/internal/pkg/reactor/README.md b/internal/pkg/reactor/README.md new file mode 100644 index 00000000..9792b996 --- /dev/null +++ b/internal/pkg/reactor/README.md @@ -0,0 +1,120 @@ +# Reactor Package Documentation +## Overview +The reactor package provides functionality to manage and control the processing of seeds. It includes mechanisms for inserting seeds, receiving feedback, and marking seeds as finished. The package ensures that operations are atomic and synchronized, maintaining consistency and avoiding race conditions. + +The reactor package is designed to be used in a concurrent environment, where multiple goroutines may interact with the reactor. It uses channels and a state table to manage the flow of seeds and their processing status. The package is thread-safe and provides error handling for common scenarios. + +A token-based system is used to limit the number of seeds processed concurrently. The reactor can be initialized with a maximum number of tokens, which determines the number of seeds that can be processed simultaneously. 
This helps prevent overloading the system and ensures efficient resource utilization. + +## Installation +To use the reactor package, import it into your package: +```go +import "github.com/internetarchive/Zeno/pkg/reactor" +``` + +## Usage +### Initialization +Before using the reactor, you need to initialize it with a maximum number of tokens and an output channel: +```go +outputChan := make(chan *models.Seed) +err := reactor.Start(5, outputChan) +if err != nil { + log.Fatalf("Error starting reactor: %v", err) +} +defer reactor.Stop() +``` +The initialization should happen once or it will error out with +``` +ErrErrReactorAlreadyInitialized || ErrReactorNotInitialized +``` + +### Inserting Seeds +To insert a seed into the reactor, use the ReceiveInsert function: +```go +seed := &models.Seed{ + UUID: uuid.New(), + URL: &gocrawlhq.URL{Value: "http://example.com"}, + Status: models.SeedFresh, + Source: models.SeedSourceHQ, +} + +err := reactor.ReceiveInsert(seed) +if err != nil { + log.Fatalf("Error inserting seed: %v", err) +} +``` +Inserting a seed will consume a token if available, allowing the seed to be processed. If no tokens are available, the function will block until a token is released and the seed can be inserted into the reactor. + +### Feedback a Seed +To send a seed for feedback, use the ReceiveFeedback function: +```go +err := reactor.ReceiveFeedback(seed) +if err != nil { + log.Fatalf("Error sending feedback: %v", err) +} +``` +Feedback can be used to reprocess a seed that got assets added. The seed will be reinserted into the reactor without consuming a token, cause it already consumed a token when inserted. + +### Marking Seeds as Finished +To mark a seed as finished, use the MarkAsFinished function: +```go +err := reactor.MarkAsFinished(seed) +if err != nil { + log.Fatalf("Error marking seed as finished: %v", err) +} +``` +Marking a seed as finished will release a token if the seed was inserted first. 
If the seed was not inserted, the function will error out with : +```go +ErrFinisehdItemNotFound +``` + +## Internals +### Reactor Struct +The reactor struct holds the state and channels for managing seed processing: +```go +type reactor struct { + tokenPool chan struct{} // Token pool to control asset count + ctx context.Context // Context for stopping the reactor + cancelFunc context.CancelFunc // Context's cancel func + input chan *models.Seed // Combined input channel for source and feedback + output chan *models.Seed // Output channel + stateTable sync.Map // State table for tracking seeds by UUID + wg sync.WaitGroup // WaitGroup to manage goroutines +} +``` + +Start Function +The Start function initializes the global reactor with the given maximum tokens and output channel. It starts the reactor's main loop in a goroutine: + +Stop Function +The Stop function stops the global reactor and waits for all goroutines to finish: + +Atomic Store and Send +The atomicStoreAndSend function performs a sync.Map store and a channel send atomically: + +ReceiveFeedback Function +The ReceiveFeedback function sends an item to the feedback channel and ensures it is present in the state table: + +ReceiveInsert Function +The ReceiveInsert function sends an item to the input channel and consumes a token: + +MarkAsFinished Function +The MarkAsFinished function marks an item as finished and releases a token if found in the state table: + +Run Function +The run function is the main loop of the reactor, which processes items from the input channel and sends them to the output channel: + +GetStateTable Function +The GetStateTable function returns a slice of all the seeds UUIDs as strings in the state table: + +Error Handling +The reactor package defines several error variables for common error scenarios: + +Testing +End-to-End Test +The TestReactorE2E function provides an end-to-end test for the reactor package: + +This test initializes the reactor, inserts mock seeds, processes them, 
and verifies that the state table is empty after processing. + +Conclusion +The reactor package provides a robust and synchronized mechanism for managing seed processing. By following the usage instructions and understanding the internals, you can effectively integrate and utilize the reactor in your application. \ No newline at end of file diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index 5ddaf5d9..42c9576a 100644 --- a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -11,13 +11,13 @@ import ( // reactor struct holds the state and channels for managing seeds processing. type reactor struct { - tokenPool chan struct{} // Token pool to control asset count - ctx context.Context // Context for stopping the reactor - cancelFunc context.CancelFunc - input chan *models.Seed // Combined input channel for source and feedback - output chan *models.Seed // Output channel - stateTable sync.Map // State table for tracking seeds by UUID - wg sync.WaitGroup // WaitGroup to manage goroutines + tokenPool chan struct{} // Token pool to control asset count + ctx context.Context // Context for stopping the reactor + cancelFunc context.CancelFunc // Context's cancel func + input chan *models.Seed // Combined input channel for source and feedback + output chan *models.Seed // Output channel + stateTable sync.Map // State table for tracking seeds by UUID + wg sync.WaitGroup // WaitGroup to manage goroutines } var ( From 850a319ef9ed1eebcf7d914a2386b8f676810487 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 11:41:43 +0100 Subject: [PATCH 011/295] reactor: prev. 
commit amend --- internal/pkg/reactor/README.md | 52 +++++++++++----------------- 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/internal/pkg/reactor/README.md b/internal/pkg/reactor/README.md index 9792b996..41587221 100644 --- a/internal/pkg/reactor/README.md +++ b/internal/pkg/reactor/README.md @@ -83,38 +83,20 @@ type reactor struct { } ``` -Start Function -The Start function initializes the global reactor with the given maximum tokens and output channel. It starts the reactor's main loop in a goroutine: - -Stop Function -The Stop function stops the global reactor and waits for all goroutines to finish: - -Atomic Store and Send -The atomicStoreAndSend function performs a sync.Map store and a channel send atomically: - -ReceiveFeedback Function -The ReceiveFeedback function sends an item to the feedback channel and ensures it is present in the state table: - -ReceiveInsert Function -The ReceiveInsert function sends an item to the input channel and consumes a token: - -MarkAsFinished Function -The MarkAsFinished function marks an item as finished and releases a token if found in the state table: - -Run Function -The run function is the main loop of the reactor, which processes items from the input channel and sends them to the output channel: - -GetStateTable Function -The GetStateTable function returns a slice of all the seeds UUIDs as strings in the state table: - -Error Handling -The reactor package defines several error variables for common error scenarios: - -Testing -End-to-End Test -The TestReactorE2E function provides an end-to-end test for the reactor package: - -This test initializes the reactor, inserts mock seeds, processes them, and verifies that the state table is empty after processing. - -Conclusion -The reactor package provides a robust and synchronized mechanism for managing seed processing. 
By following the usage instructions and understanding the internals, you can effectively integrate and utilize the reactor in your application. \ No newline at end of file +### Maintaining Equilibrium +The reactor maintains equilibrium in the system through the following mechanisms: + +1. Token-Based Concurrency Control: + - The token pool limits the number of concurrent seeds being processed. + - Each seed consumes a token when inserted and releases it when marked as finished. + - This prevents overloading the system and ensures efficient resource utilization. +2. Channel Operations: + - The reactor uses a channel-based synchronization mechanism with a buffer on the input channel ensuring that no deadlock can happen. + - The output channel is expected to be unbuffered. +3. State Management: + - The state table tracks the state of each seed by its UUID. + - The state map is held to check that every seed that goes into feedback was already ingested first, ensuring a fixed amount of seeds in the system. + - This allows the reactor to manage seeds efficiently and handle feedback and completion correctly. + +## Conclusion +The `reactor` package provides a robust and synchronized mechanism for managing seed processing in a concurrent environment. By using channels, a state table, and a token-based system, it ensures efficient resource utilization and maintains equilibrium in the system. 
This architecture allows for scalable and reliable seed processing without sacrificing efficiency \ No newline at end of file From 35423403b90b9eed5727c58d4570686eeebc315d Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 19 Nov 2024 11:43:00 +0100 Subject: [PATCH 012/295] fix: variable name for cancel func in reactor --- internal/pkg/reactor/reactor.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index 42c9576a..59aef1af 100644 --- a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -13,7 +13,7 @@ import ( type reactor struct { tokenPool chan struct{} // Token pool to control asset count ctx context.Context // Context for stopping the reactor - cancelFunc context.CancelFunc // Context's cancel func + cancel context.CancelFunc // Context's cancel func input chan *models.Seed // Combined input channel for source and feedback output chan *models.Seed // Output channel stateTable sync.Map // State table for tracking seeds by UUID @@ -33,11 +33,11 @@ func Start(maxTokens int, outputChan chan *models.Seed) error { once.Do(func() { ctx, cancel := context.WithCancel(context.Background()) globalReactor = &reactor{ - tokenPool: make(chan struct{}, maxTokens), - ctx: ctx, - cancelFunc: cancel, - input: make(chan *models.Seed, maxTokens), - output: outputChan, + tokenPool: make(chan struct{}, maxTokens), + ctx: ctx, + cancel: cancel, + input: make(chan *models.Seed, maxTokens), + output: outputChan, } globalReactor.wg.Add(1) go globalReactor.run() @@ -55,7 +55,7 @@ func Start(maxTokens int, outputChan chan *models.Seed) error { // Stop stops the global reactor and waits for all goroutines to finish. 
func Stop() { if globalReactor != nil { - globalReactor.cancelFunc() + globalReactor.cancel() globalReactor.wg.Wait() close(globalReactor.output) fmt.Println("Reactor stopped") From 989a4be26698a1fecf93a9d9d831f5f5e145cfd5 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 19 Nov 2024 11:45:11 +0100 Subject: [PATCH 013/295] fix: variable name for cancel func in reactor's documentation, logging --- internal/pkg/reactor/README.md | 2 +- internal/pkg/reactor/reactor.go | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/internal/pkg/reactor/README.md b/internal/pkg/reactor/README.md index 41587221..f9369886 100644 --- a/internal/pkg/reactor/README.md +++ b/internal/pkg/reactor/README.md @@ -75,7 +75,7 @@ The reactor struct holds the state and channels for managing seed processing: type reactor struct { tokenPool chan struct{} // Token pool to control asset count ctx context.Context // Context for stopping the reactor - cancelFunc context.CancelFunc // Context's cancel func + cancel context.CancelFunc // Context's cancel func input chan *models.Seed // Combined input channel for source and feedback output chan *models.Seed // Output channel stateTable sync.Map // State table for tracking seeds by UUID diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index 59aef1af..7ca8e6a7 100644 --- a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -3,7 +3,7 @@ package reactor import ( "context" - "fmt" + "log/slog" "sync" "github.com/internetarchive/Zeno/pkg/models" @@ -41,7 +41,7 @@ func Start(maxTokens int, outputChan chan *models.Seed) error { } globalReactor.wg.Add(1) go globalReactor.run() - fmt.Println("Reactor started") + slog.Info("reactor started") done = true }) @@ -58,7 +58,7 @@ func Stop() { globalReactor.cancel() globalReactor.wg.Wait() close(globalReactor.output) - fmt.Println("Reactor stopped") + slog.Info("reactor stopped") } } @@ -123,7 +123,7 @@ func (r *reactor) run() { select { 
// Closes the run routine when context is canceled case <-r.ctx.Done(): - fmt.Println("Reactor shutting down...") + slog.Info("reactor shutting down") return // Feeds items to the output channel From 6eef3359554e7a6b2dcfbd772fe282a098034b2a Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 19 Nov 2024 13:11:00 +0100 Subject: [PATCH 014/295] add: preprocessor architecture --- cmd/cmd.go | 2 +- internal/pkg/config/config.go | 8 +- internal/pkg/preprocessor/README.md | 53 +++++++++++ internal/pkg/preprocessor/error.go | 17 ++++ internal/pkg/preprocessor/preprocessor.go | 108 ++++++++++++++++++++++ internal/pkg/preprocessor/url.go | 10 ++ internal/pkg/reactor/reactor.go | 22 ++--- internal/pkg/reactor/reactor_test.go | 16 ++-- main.go | 23 ++++- pkg/models/seed.go | 64 ++++++------- 10 files changed, 264 insertions(+), 59 deletions(-) create mode 100644 internal/pkg/preprocessor/README.md create mode 100644 internal/pkg/preprocessor/error.go create mode 100644 internal/pkg/preprocessor/preprocessor.go create mode 100644 internal/pkg/preprocessor/url.go diff --git a/cmd/cmd.go b/cmd/cmd.go index c6002835..3c44c307 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -27,7 +27,7 @@ Authors: return fmt.Errorf("error initializing config: %s", err) } - cfg = config.GetConfig() + cfg = config.Get() return nil }, Run: func(cmd *cobra.Command, args []string) { diff --git a/internal/pkg/config/config.go b/internal/pkg/config/config.go index 8572a780..16e0ae0b 100644 --- a/internal/pkg/config/config.go +++ b/internal/pkg/config/config.go @@ -175,14 +175,12 @@ func BindFlags(flagSet *pflag.FlagSet) { }) } -// GetConfig returns the config struct -func GetConfig() *Config { +// Get returns the config struct +func Get() *Config { return config } func GenerateCrawlConfig() error { - config.LiveStats = config.LiveStats - // If the job name isn't specified, we generate a random name if config.Job == "" { if config.HQProject != "" { @@ -196,8 +194,6 @@ func GenerateCrawlConfig() error { 
config.Job = UUID.String() } - } else { - config.Job = config.Job } config.JobPath = path.Join("jobs", config.Job) diff --git a/internal/pkg/preprocessor/README.md b/internal/pkg/preprocessor/README.md new file mode 100644 index 00000000..db1cb835 --- /dev/null +++ b/internal/pkg/preprocessor/README.md @@ -0,0 +1,53 @@ +# Preprocessor Package Documentation +## Overview +The preprocessor package provides functionality to prepare seeds for capture. It includes mechanisms for validating URLs and preprocessing items before they are sent for capture. The package ensures that operations are atomic and synchronized, maintaining consistency and avoiding race conditions. + +The preprocessor package is designed to be used in a concurrent environment, where multiple goroutines may interact with the preprocessor. It uses channels to manage the flow of items and their preprocessing status. The package is thread-safe and provides error handling for common scenarios. + +## Installation +To use the preprocessor package, import it into your package: +```go +import "github.com/internetarchive/Zeno/internal/pkg/preprocessor" +``` + +## Usage +### Initialization +Before using the preprocessor, you need to initialize it with input and output channels: +```go +inputChan := make(chan *models.Item) +outputChan := make(chan *models.Item) +err := preprocessor.Start(inputChan, outputChan) +if err != nil { + log.Fatalf("Error starting preprocessor: %v", err) +} +defer preprocessor.Stop() +``` +The initialization should happen once or it will error out with +``` +ErrPreprocessorAlreadyInitialized || ErrPreprocessorNotInitialized +``` + +### Preprocessing Items +To preprocess an item, send it to the input channel: +```go +item := &models.Item{ + UUID: uuid.New(), + URL: &gocrawlhq.URL{Value: "http://example.com"}, + Status: models.ItemFresh, +} +inputChan <- item +``` +The preprocessed item will be sent to the output channel after preprocessing. 
+ +## Internals +### Preprocessor Struct +The preprocessor struct holds the state and channels for managing item preprocessing: +```go +type preprocessor struct { + wg sync.WaitGroup + ctx context.Context + cancel context.CancelFunc + input chan *models.Item + output chan *models.Item +} +``` \ No newline at end of file diff --git a/internal/pkg/preprocessor/error.go b/internal/pkg/preprocessor/error.go new file mode 100644 index 00000000..53005de9 --- /dev/null +++ b/internal/pkg/preprocessor/error.go @@ -0,0 +1,17 @@ +package preprocessor + +import "errors" + +var ( + // ErrPreprocessorAlreadyInitialized is the error returned when the preprocessor is already initialized + ErrPreprocessorAlreadyInitialized = errors.New("preprocessor already initialized") + // ErrPreprocessorNotInitialized is the error returned when the preprocessor is not initialized + ErrPreprocessorNotInitialized = errors.New("preprocessor not initialized") + // ErrPreprocessorShuttingDown is the error returned when the preprocessor is shutting down + ErrPreprocessorShuttingDown = errors.New("preprocessor shutting down") + + // ErrFeedbackItemNotPresent is the error returned when an item was sent to the feedback channel but not found in the state table + ErrFeedbackItemNotPresent = errors.New("feedback item not present in state table") + // ErrFinisehdItemNotFound is the error returned when an item been marked as finished but not found in the state table + ErrFinisehdItemNotFound = errors.New("markAsFinished item not present in state table") +) diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go new file mode 100644 index 00000000..602bb434 --- /dev/null +++ b/internal/pkg/preprocessor/preprocessor.go @@ -0,0 +1,108 @@ +package preprocessor + +import ( + "context" + "log/slog" + "sync" + + "github.com/internetarchive/Zeno/pkg/models" +) + +type preprocessor struct { + wg sync.WaitGroup + ctx context.Context + cancel context.CancelFunc + input chan 
*models.Item + output chan *models.Item +} + +var ( + globalPreprocessor *preprocessor + once sync.Once +) + +// This functions starts the preprocessor responsible for preparing +// the seeds sent by the reactor for captures +func Start(inputChan, outputChan chan *models.Item) error { + var done bool + + once.Do(func() { + ctx, cancel := context.WithCancel(context.Background()) + globalPreprocessor = &preprocessor{ + ctx: ctx, + cancel: cancel, + input: inputChan, + output: outputChan, + } + globalPreprocessor.wg.Add(1) + go globalPreprocessor.run() + slog.Info("preprocessor started") + done = true + }) + + if !done { + return ErrPreprocessorAlreadyInitialized + } + + return nil +} + +func Stop() { + if globalPreprocessor != nil { + globalPreprocessor.cancel() + globalPreprocessor.wg.Wait() + close(globalPreprocessor.output) + slog.Info("preprocessor stopped") + } +} + +func (p *preprocessor) run() { + defer p.wg.Done() + + for { + select { + // Closes the run routine when context is canceled + case <-p.ctx.Done(): + slog.Info("preprocessor shutting down") + return + case item, ok := <-p.input: + if ok { + globalPreprocessor.wg.Add(1) + go p.preprocess(item) + } + } + } +} + +func (p *preprocessor) preprocess(item *models.Item) { + defer globalPreprocessor.wg.Done() + + // Validate the URL of either the item itself and/or its childs + var err error + if item.Status == models.ItemFresh { + // Preprocess the item's URL itself + item.URL.Value, err = validateURL(item.URL.Value, nil) + if err != nil { + slog.Warn("unable to validate URL", "url", item.URL.Value, "err", err.Error(), "func", "preprocessor.preprocess") + return + } + } else if len(item.Childs) > 0 { + // Preprocess the childs + for i := 0; i < len(item.Childs); { + child := item.Childs[i] + item.Childs[i].Value, err = validateURL(child.Value, item.URL) + if err != nil { + // If we can't validate an URL, we remove it from the list of childs + slog.Warn("unable to validate URL", "url", child.Value, "err", 
err.Error(), "func", "preprocessor.preprocess") + item.Childs = append(item.Childs[:i], item.Childs[i+1:]...) + } else { + i++ + } + } + } else { + slog.Error("item got into preprocessing without anything to preprocess") + } + + // Final step, send the preprocessed item to the output chan of the preprocessor + p.output <- item +} diff --git a/internal/pkg/preprocessor/url.go b/internal/pkg/preprocessor/url.go new file mode 100644 index 00000000..094a11bd --- /dev/null +++ b/internal/pkg/preprocessor/url.go @@ -0,0 +1,10 @@ +package preprocessor + +import ( + "github.com/internetarchive/gocrawlhq" +) + +func validateURL(URL string, parentURL *gocrawlhq.URL) (validatedURL string, err error) { + // Validate the URL, make it absolute if needed, etc. + return URL, err +} diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index 7ca8e6a7..2489c174 100644 --- a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -14,8 +14,8 @@ type reactor struct { tokenPool chan struct{} // Token pool to control asset count ctx context.Context // Context for stopping the reactor cancel context.CancelFunc // Context's cancel func - input chan *models.Seed // Combined input channel for source and feedback - output chan *models.Seed // Output channel + input chan *models.Item // Combined input channel for source and feedback + output chan *models.Item // Output channel stateTable sync.Map // State table for tracking seeds by UUID wg sync.WaitGroup // WaitGroup to manage goroutines } @@ -27,7 +27,7 @@ var ( // Start initializes the global reactor with the given maximum tokens. // This method can only be called once. 
-func Start(maxTokens int, outputChan chan *models.Seed) error { +func Start(maxTokens int, outputChan chan *models.Item) error { var done bool once.Do(func() { @@ -36,7 +36,7 @@ func Start(maxTokens int, outputChan chan *models.Seed) error { tokenPool: make(chan struct{}, maxTokens), ctx: ctx, cancel: cancel, - input: make(chan *models.Seed, maxTokens), + input: make(chan *models.Item, maxTokens), output: outputChan, } globalReactor.wg.Add(1) @@ -64,12 +64,12 @@ func Stop() { // ReceiveFeedback sends an item to the feedback channel. // If the item is not present on the state table it gets discarded -func ReceiveFeedback(item *models.Seed) error { +func ReceiveFeedback(item *models.Item) error { if globalReactor == nil { return ErrReactorNotInitialized } - item.Source = models.SeedSourceFeedback + item.Source = models.ItemSourceFeedback _, loaded := globalReactor.stateTable.Swap(item.UUID.String(), item) if !loaded { // An item sent to the feedback channel should be present on the state table, if not present reactor should error out @@ -84,16 +84,16 @@ func ReceiveFeedback(item *models.Seed) error { } // ReceiveInsert sends an item to the input channel consuming a token. 
-// It is the responsibility of the sender to set either SeedSourceQueue or SeedSourceHQ, if not set seed will get forced SeedSourceInsert -func ReceiveInsert(item *models.Seed) error { +// It is the responsibility of the sender to set either ItemSourceQueue or ItemSourceHQ, if not set seed will get forced ItemSourceInsert +func ReceiveInsert(item *models.Item) error { if globalReactor == nil { return ErrReactorNotInitialized } select { case globalReactor.tokenPool <- struct{}{}: - if item.Source != models.SeedSourceQueue && item.Source != models.SeedSourceHQ { - item.Source = models.SeedSourceInsert + if item.Source != models.ItemSourceQueue && item.Source != models.ItemSourceHQ { + item.Source = models.ItemSourceInsert } globalReactor.stateTable.Store(item.UUID.String(), item) globalReactor.input <- item @@ -104,7 +104,7 @@ func ReceiveInsert(item *models.Seed) error { } // MarkAsFinished marks an item as finished and releases a token if found in the state table. -func MarkAsFinished(item *models.Seed) error { +func MarkAsFinished(item *models.Item) error { if globalReactor == nil { return ErrReactorNotInitialized } diff --git a/internal/pkg/reactor/reactor_test.go b/internal/pkg/reactor/reactor_test.go index 79b2cefa..c72a6b0a 100644 --- a/internal/pkg/reactor/reactor_test.go +++ b/internal/pkg/reactor/reactor_test.go @@ -12,7 +12,7 @@ import ( func TestReactorE2E(t *testing.T) { // Initialize the reactor with a maximum of 5 tokens - outputChan := make(chan *models.Seed) + outputChan := make(chan *models.Item) err := Start(1, outputChan) if err != nil { t.Logf("Error starting reactor: %s", err) @@ -30,7 +30,7 @@ func TestReactorE2E(t *testing.T) { continue } // Send feedback for the consumed item - if item.Source != models.SeedSourceFeedback { + if item.Source != models.ItemSourceFeedback { err := ReceiveFeedback(item) if err != nil { t.Fatalf("Error sending feedback: %s - %s", err, item.UUID.String()) @@ -39,7 +39,7 @@ func TestReactorE2E(t *testing.T) { } // 
Mark the item as finished - if item.Source == models.SeedSourceFeedback { + if item.Source == models.ItemSourceFeedback { err := MarkAsFinished(item) if err != nil { t.Fatalf("Error marking item as finished: %s", err) @@ -52,19 +52,19 @@ func TestReactorE2E(t *testing.T) { } // Create mock seeds - mockSeeds := []*models.Seed{} + mockItems := []*models.Item{} for i := 0; i <= 1000; i++ { uuid := uuid.New() - mockSeeds = append(mockSeeds, &models.Seed{ + mockItems = append(mockItems, &models.Item{ UUID: &uuid, URL: &gocrawlhq.URL{Value: fmt.Sprintf("http://example.com/%d", i)}, - Status: models.SeedFresh, - Source: models.SeedSourceHQ, + Status: models.ItemFresh, + Source: models.ItemSourceHQ, }) } // Queue mock seeds to the source channel - for _, seed := range mockSeeds { + for _, seed := range mockItems { err := ReceiveInsert(seed) if err != nil { t.Fatalf("Error queuing seed to source channel: %s", err) diff --git a/main.go b/main.go index 2e86f043..4abeffe6 100644 --- a/main.go +++ b/main.go @@ -14,6 +14,10 @@ import ( "os" "github.com/internetarchive/Zeno/cmd" + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/preprocessor" + "github.com/internetarchive/Zeno/internal/pkg/reactor" + "github.com/internetarchive/Zeno/pkg/models" ) func main() { @@ -22,5 +26,22 @@ func main() { os.Exit(1) } - slog.Info("and here we COOK") + fmt.Printf("%+v\n", config.Get()) + + // Start the reactor that will receive + reactorOutputChan := make(chan *models.Item) + err := reactor.Start(config.Get().WorkersCount, reactorOutputChan) + if err != nil { + slog.Error("error starting reactor", "err", err.Error()) + return + } + defer reactor.Stop() + + preprocessorOutputChan := make(chan *models.Item) + err = preprocessor.Start(reactorOutputChan, preprocessorOutputChan) + if err != nil { + slog.Error("error starting preprocessor", "err", err.Error()) + return + } + defer preprocessor.Stop() } diff --git a/pkg/models/seed.go 
b/pkg/models/seed.go index 3d7ff260..1eecbdcb 100644 --- a/pkg/models/seed.go +++ b/pkg/models/seed.go @@ -5,44 +5,44 @@ import ( "github.com/internetarchive/gocrawlhq" ) -// Seed represents a URL, it's assets and it's state in the pipeline -type Seed struct { - UUID *uuid.UUID // UUID is the unique identifier of the seed - URL *gocrawlhq.URL // URL is the URL of the seed - Status SeedState // Status is the state of the seed in the pipeline - Source SeedSource // Source is the source of the seed in the pipeline - AssetsCaptured bool // AssetsCaptured is the flag to indicate if the assets of the seed has been captured - Assets []*gocrawlhq.URL // Assets is the list of assets of the seed +// Item represents a URL, it's childs (e.g. discovered assets) and it's state in the pipeline +type Item struct { + UUID *uuid.UUID // UUID is the unique identifier of the item + URL *gocrawlhq.URL // URL is the URL of the item + Status ItemState // Status is the state of the item in the pipeline + Source ItemSource // Source is the source of the item in the pipeline + ChildsCaptured bool // ChildsCaptured is the flag to indicate if the child URLs of the item have been captured + Childs []*gocrawlhq.URL // Childs is the list of URLs that have been discovered via the item's URL } -// SeedState qualifies the state of a seed in the pipeline -type SeedState int +// ItemState qualifies the state of a item in the pipeline +type ItemState int const ( - // SeedFresh is the initial state of a seed either it's from HQ, the Queue or Feedback - SeedFresh SeedState = iota - // SeedPreProcessed is the state after the seed has been pre-processed - SeedPreProcessed - // SeedCaptured is the state after the seed has been captured - SeedCaptured - // SeedPostProcessed is the state after the seed has been post-processed - SeedPostProcessed - // SeedFailed is the state after the seed has failed - SeedFailed - // SeedCompleted is the state after the seed has been completed - SeedCompleted + // ItemFresh 
is the initial state of a item either it's from HQ, the Queue or Feedback + ItemFresh ItemState = iota + // ItemPreProcessed is the state after the item has been pre-processed + ItemPreProcessed + // ItemCaptured is the state after the item has been captured + ItemCaptured + // ItemPostProcessed is the state after the item has been post-processed + ItemPostProcessed + // ItemFailed is the state after the item has failed + ItemFailed + // ItemCompleted is the state after the item has been completed + ItemCompleted ) -// SeedSource qualifies the source of a seed in the pipeline -type SeedSource int +// ItemSource qualifies the source of a item in the pipeline +type ItemSource int const ( - // SeedSourceInsert is for seeds which source is not defined when inserted on reactor - SeedSourceInsert SeedSource = iota - // SeedSourceQueue is for seeds that are from the Queue - SeedSourceQueue - // SeedSourceHQ is for seeds that are from the HQ - SeedSourceHQ - // SeedSourceFeedback is for seeds that are from the Feedback - SeedSourceFeedback + // ItemSourceInsert is for items which source is not defined when inserted on reactor + ItemSourceInsert ItemSource = iota + // ItemSourceQueue is for items that are from the Queue + ItemSourceQueue + // ItemSourceHQ is for items that are from the HQ + ItemSourceHQ + // ItemSourceFeedback is for items that are from the Feedback + ItemSourceFeedback ) From f1966e4ee112f277889132c06a962f15833496ea Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 19 Nov 2024 13:36:34 +0100 Subject: [PATCH 015/295] add: goroutines limiter in preprocessor --- internal/pkg/preprocessor/preprocessor.go | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index 602bb434..54e4eb4c 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -5,6 +5,7 @@ import ( "log/slog" "sync" + 
"github.com/internetarchive/Zeno/internal/pkg/config" "github.com/internetarchive/Zeno/pkg/models" ) @@ -59,6 +60,11 @@ func Stop() { func (p *preprocessor) run() { defer p.wg.Done() + var ( + wg sync.WaitGroup + guard = make(chan struct{}, config.Get().WorkersCount) + ) + for { select { // Closes the run routine when context is canceled @@ -67,16 +73,19 @@ func (p *preprocessor) run() { return case item, ok := <-p.input: if ok { - globalPreprocessor.wg.Add(1) - go p.preprocess(item) + guard <- struct{}{} + wg.Add(1) + go func() { + defer wg.Done() + defer func() { <-guard }() + p.preprocess(item) + }() } } } } func (p *preprocessor) preprocess(item *models.Item) { - defer globalPreprocessor.wg.Done() - // Validate the URL of either the item itself and/or its childs var err error if item.Status == models.ItemFresh { From de0fe224d1d7e638664a1ee6a55e6beb627818fb Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 19 Nov 2024 14:09:02 +0100 Subject: [PATCH 016/295] add: custom URL struct --- .old/internal/pkg/crawl/hq.go | 387 ---------------------- internal/pkg/config/config.go | 2 +- internal/pkg/preprocessor/url.go | 6 +- internal/pkg/reactor/reactor_test.go | 20 +- internal/pkg/source/hq.go | 384 +++++++++++++++++++++ internal/pkg/source/source.go | 7 + internal/pkg/{config => utils}/version.go | 4 +- pkg/models/seed.go | 13 +- pkg/models/url.go | 15 + 9 files changed, 432 insertions(+), 406 deletions(-) delete mode 100644 .old/internal/pkg/crawl/hq.go create mode 100644 internal/pkg/source/hq.go create mode 100644 internal/pkg/source/source.go rename internal/pkg/{config => utils}/version.go (94%) create mode 100644 pkg/models/url.go diff --git a/.old/internal/pkg/crawl/hq.go b/.old/internal/pkg/crawl/hq.go deleted file mode 100644 index 8a9b9649..00000000 --- a/.old/internal/pkg/crawl/hq.go +++ /dev/null @@ -1,387 +0,0 @@ -package crawl - -import ( - "math" - "net/url" - "strings" - "sync" - "time" - - "github.com/internetarchive/Zeno/internal/pkg/queue" 
- "github.com/internetarchive/Zeno/internal/pkg/utils" - "github.com/internetarchive/gocrawlhq" -) - -// This function connects to HQ's websocket and listen for messages. -// It also sends and "identify" message to the HQ to let it know that -// Zeno is connected. This "identify" message is sent every second and -// contains the crawler's stats and details. -func (c *Crawl) HQWebsocket() { - var ( - // the "identify" message will be sent every second - // to the crawl HQ - identifyTicker = time.NewTicker(time.Second) - ) - - defer func() { - identifyTicker.Stop() - }() - - // send an "identify" message to the crawl HQ every second - for { - err := c.HQClient.Identify(&gocrawlhq.IdentifyMessage{ - Project: c.HQProject, - Job: c.Job, - IP: utils.GetOutboundIP().String(), - Hostname: utils.GetHostname(), - GoVersion: utils.GetVersion().GoVersion, - }) - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending identify payload to crawl HQ, trying to reconnect..") - - err = c.HQClient.InitWebsocketConn() - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error initializing websocket connection to crawl HQ") - } - } - - <-identifyTicker.C - } -} - -func (c *Crawl) HQProducer() { - defer c.HQChannelsWg.Done() - - var ( - discoveredArray = []gocrawlhq.URL{} - mutex = sync.Mutex{} - terminateProducer = make(chan bool) - ) - - // the discoveredArray is sent to the crawl HQ every 10 seconds - // or when it reaches a certain size - go func() { - HQLastSent := time.Now() - - for { - select { - case <-terminateProducer: - // no need to lock the mutex here, because the producer channel - // is already closed, so no other goroutine can write to the slice - if len(discoveredArray) > 0 { - for { - err := c.HQClient.Add(discoveredArray, false) - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then 
retrying..") - time.Sleep(time.Second) - continue - } - break - } - } - - return - default: - mutex.Lock() - if (len(discoveredArray) >= int(math.Ceil(float64(c.Workers.Count)/2)) || time.Since(HQLastSent) >= time.Second*10) && len(discoveredArray) > 0 { - for { - err := c.HQClient.Add(discoveredArray, false) - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..") - time.Sleep(time.Second) - continue - } - break - } - - discoveredArray = []gocrawlhq.URL{} - HQLastSent = time.Now() - } - mutex.Unlock() - } - } - }() - - // listen to the discovered channel and add the URLs to the discoveredArray - for discoveredItem := range c.HQProducerChannel { - var via string - - if discoveredItem.ParentURL != nil { - via = utils.URLToString(discoveredItem.ParentURL) - } - - discoveredURL := gocrawlhq.URL{ - Value: utils.URLToString(discoveredItem.URL), - Via: via, - } - - for i := uint64(0); i < discoveredItem.Hop; i++ { - discoveredURL.Path += "L" - } - - // The reason we are using a string instead of a bool is because - // gob's encode/decode doesn't properly support booleans - if discoveredItem.BypassSeencheck { - for { - err := c.HQClient.Add([]gocrawlhq.URL{discoveredURL}, true) - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ - "bypassSeencheck": discoveredItem.BypassSeencheck, - })).Error("error sending payload to crawl HQ, waiting 1s then retrying..") - time.Sleep(time.Second) - continue - } - break - } - continue - } - - mutex.Lock() - discoveredArray = append(discoveredArray, discoveredURL) - mutex.Unlock() - } - - // if we are here, it means that the HQProducerChannel has been closed - // so we need to send the last payload to the crawl HQ - terminateProducer <- true -} - -func (c *Crawl) HQConsumer() { - for { - c.HQConsumerState = "running" - - // This is on purpose evaluated every time, - // because the value of workers will 
maybe change - // during the crawl in the future (to be implemented) - var HQBatchSize = int(c.Workers.Count) - - if c.Finished.Get() { - c.HQConsumerState = "finished" - c.Log.Error("crawl finished, stopping HQ consumer") - break - } - - // If HQContinuousPull is set to true, we will pull URLs from HQ continuously, - // otherwise we will only pull URLs when needed (and when the crawl is not paused) - for (c.Queue.GetStats().TotalElements > HQBatchSize && !c.HQContinuousPull) || c.Paused.Get() || c.Queue.HandoverOpen.Get() { - c.HQConsumerState = "waiting" - c.Log.Info("HQ producer waiting", "paused", c.Paused.Get(), "handoverOpen", c.Queue.HandoverOpen.Get(), "queueSize", c.Queue.GetStats().TotalElements) - time.Sleep(time.Millisecond * 50) - continue - } - - // If a specific HQ batch size is set, use it - if c.HQBatchSize != 0 { - HQBatchSize = c.HQBatchSize - } - - // get batch from crawl HQ - c.HQConsumerState = "waitingOnFeed" - var URLs []gocrawlhq.URL - var err error - if c.HQBatchConcurrency == 1 { - URLs, err = c.HQClient.Get(HQBatchSize, c.HQStrategy) - if err != nil { - // c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ - // "batchSize": HQBatchSize, - // "err": err, - // })).Debug("error getting new URLs from crawl HQ") - continue - } - } else { - var mu sync.Mutex - var wg sync.WaitGroup - batchSize := HQBatchSize / c.HQBatchConcurrency - URLsChan := make(chan []gocrawlhq.URL, c.HQBatchConcurrency) - - // Start goroutines to get URLs from crawl HQ, each will request - // HQBatchSize / HQConcurrentBatch URLs - for i := 0; i < c.HQBatchConcurrency; i++ { - wg.Add(1) - go func() { - defer wg.Done() - URLs, err := c.HQClient.Get(batchSize, c.HQStrategy) - if err != nil { - // c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ - // "batchSize": batchSize, - // "err": err, - // })).Debug("error getting new URLs from crawl HQ") - return - } - URLsChan <- URLs - }() - } - - // Wait for all goroutines to finish - go func() { - 
wg.Wait() - close(URLsChan) - }() - - // Collect all URLs from the channels - for URLsFromChan := range URLsChan { - mu.Lock() - URLs = append(URLs, URLsFromChan...) - mu.Unlock() - } - } - c.HQConsumerState = "feedCompleted" - - // send all URLs received in the batch to the queue - var items = make([]*queue.Item, 0, len(URLs)) - if len(URLs) > 0 { - for _, URL := range URLs { - c.HQConsumerState = "urlParse" - newURL, err := url.Parse(URL.Value) - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ - "url": URL.Value, - "batchSize": HQBatchSize, - "err": err, - })).Error("unable to parse URL received from crawl HQ, discarding") - continue - } - - c.HQConsumerState = "newItem" - newItem, err := queue.NewItem(newURL, nil, "seed", uint64(strings.Count(URL.Path, "L")), URL.ID, false) - if err != nil { - c.Log.WithFields(c.genLogFields(err, newURL, map[string]interface{}{ - "url": URL.Value, - "batchSize": HQBatchSize, - "err": err, - })).Error("unable to create new item from URL received from crawl HQ, discarding") - continue - } - - c.HQConsumerState = "append" - items = append(items, newItem) - } - } - - c.HQConsumerState = "enqueue" - err = c.Queue.BatchEnqueue(items...) 
- if err != nil { - c.Log.Error("unable to enqueue URL batch received from crawl HQ, discarding", "error", err) - continue - } - } -} - -func (c *Crawl) HQFinisher() { - defer c.HQChannelsWg.Done() - - var ( - finishedArray = []gocrawlhq.URL{} - locallyCrawledTotal int - ) - - for finishedItem := range c.HQFinishedChannel { - if finishedItem.ID == "" { - c.Log.WithFields(c.genLogFields(nil, finishedItem.URL, nil)).Warn("URL has no ID, discarding") - continue - } - - locallyCrawledTotal += int(finishedItem.LocallyCrawled) - finishedArray = append(finishedArray, gocrawlhq.URL{ID: finishedItem.ID, Value: utils.URLToString(finishedItem.URL)}) - - if len(finishedArray) == int(math.Ceil(float64(c.Workers.Count)/2)) { - for { - err := c.HQClient.Delete(finishedArray, locallyCrawledTotal) - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ - "finishedArray": finishedArray, - })).Error("error submitting finished urls to crawl HQ. retrying in one second...") - time.Sleep(time.Second) - continue - } - break - } - - finishedArray = []gocrawlhq.URL{} - locallyCrawledTotal = 0 - } - } - - // send remaining finished URLs - if len(finishedArray) > 0 { - for { - err := c.HQClient.Delete(finishedArray, locallyCrawledTotal) - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ - "finishedArray": finishedArray, - })).Error("error submitting finished urls to crawl HQ. 
retrying in one second...") - time.Sleep(time.Second) - continue - } - break - } - } -} - -func (c *Crawl) HQSeencheckURLs(URLs []*url.URL) (seencheckedBatch []*url.URL, err error) { - var ( - discoveredURLs []gocrawlhq.URL - ) - - for _, URL := range URLs { - discoveredURLs = append(discoveredURLs, gocrawlhq.URL{ - Value: utils.URLToString(URL), - Type: "asset", - }) - } - - outputURLs, err := c.HQClient.Seencheck(discoveredURLs) - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ - "batchLen": len(URLs), - "urls": discoveredURLs, - })).Error("error sending seencheck payload to crawl HQ") - return seencheckedBatch, err - } - - if outputURLs != nil { - for _, URL := range outputURLs { - // the returned payload only contain new URLs to be crawled by Zeno - newURL, err := url.Parse(URL.Value) - if err != nil { - c.Log.WithFields(c.genLogFields(err, URL, map[string]interface{}{ - "batchLen": len(URLs), - })).Error("error parsing URL from HQ seencheck response") - return seencheckedBatch, err - } - - seencheckedBatch = append(seencheckedBatch, newURL) - } - } - - return seencheckedBatch, nil -} - -// returns: -// - bool: true if the URL is new, false if it has been seen before -// - error: if there's an error sending the payload to crawl HQ -// -// NOTE: if there's an error, the URL is considered new -func (c *Crawl) HQSeencheckURL(URL *url.URL) (bool, error) { - discoveredURL := gocrawlhq.URL{ - Value: utils.URLToString(URL), - Type: "asset", - } - - outputURLs, err := c.HQClient.Seencheck([]gocrawlhq.URL{discoveredURL}) - if err != nil { - c.Log.Error("error sending seencheck payload to crawl HQ", "err", err, "url", utils.URLToString(URL)) - return true, err // return true, don't discard the URL if there's an error - } - - if outputURLs != nil { - for _, URL := range outputURLs { - // the returned payload only contain new URLs to be crawled by Zeno - if URL.Value == discoveredURL.Value { - return true, nil - } - } - } - - return 
false, nil -} diff --git a/internal/pkg/config/config.go b/internal/pkg/config/config.go index 16e0ae0b..07604e54 100644 --- a/internal/pkg/config/config.go +++ b/internal/pkg/config/config.go @@ -216,7 +216,7 @@ func GenerateCrawlConfig() error { } if config.UserAgent == "" { - version := getVersion() + version := utils.GetVersion() // If Version is a commit hash, we only take the first 7 characters if len(version.Version) >= 40 { diff --git a/internal/pkg/preprocessor/url.go b/internal/pkg/preprocessor/url.go index 094a11bd..f4cab0b8 100644 --- a/internal/pkg/preprocessor/url.go +++ b/internal/pkg/preprocessor/url.go @@ -1,10 +1,8 @@ package preprocessor -import ( - "github.com/internetarchive/gocrawlhq" -) +import "github.com/internetarchive/Zeno/pkg/models" -func validateURL(URL string, parentURL *gocrawlhq.URL) (validatedURL string, err error) { +func validateURL(URL string, parentURL *models.URL) (validatedURL string, err error) { // Validate the URL, make it absolute if needed, etc. 
return URL, err } diff --git a/internal/pkg/reactor/reactor_test.go b/internal/pkg/reactor/reactor_test.go index c72a6b0a..d455b322 100644 --- a/internal/pkg/reactor/reactor_test.go +++ b/internal/pkg/reactor/reactor_test.go @@ -20,9 +20,12 @@ func TestReactorE2E(t *testing.T) { } defer Stop() + // Channel to collect errors from goroutines + errorChan := make(chan error) + // Consume items from the output channel, start 5 goroutines for i := 0; i < 5; i++ { - go func(t *testing.T) { + go func() { for { select { case item := <-outputChan: @@ -33,7 +36,7 @@ func TestReactorE2E(t *testing.T) { if item.Source != models.ItemSourceFeedback { err := ReceiveFeedback(item) if err != nil { - t.Fatalf("Error sending feedback: %s - %s", err, item.UUID.String()) + errorChan <- fmt.Errorf("Error sending feedback: %s - %s", err, item.UUID.String()) } continue } @@ -42,22 +45,29 @@ func TestReactorE2E(t *testing.T) { if item.Source == models.ItemSourceFeedback { err := MarkAsFinished(item) if err != nil { - t.Fatalf("Error marking item as finished: %s", err) + errorChan <- fmt.Errorf("Error marking item as finished: %s", err) } continue } } } - }(t) + }() } + // Handle errors from goroutines + go func() { + for err := range errorChan { + t.Error(err) + } + }() + // Create mock seeds mockItems := []*models.Item{} for i := 0; i <= 1000; i++ { uuid := uuid.New() mockItems = append(mockItems, &models.Item{ UUID: &uuid, - URL: &gocrawlhq.URL{Value: fmt.Sprintf("http://example.com/%d", i)}, + URL: &models.URL{URL: gocrawlhq.URL{Value: fmt.Sprintf("http://example.com/%d", i)}}, Status: models.ItemFresh, Source: models.ItemSourceHQ, }) diff --git a/internal/pkg/source/hq.go b/internal/pkg/source/hq.go new file mode 100644 index 00000000..47c24f34 --- /dev/null +++ b/internal/pkg/source/hq.go @@ -0,0 +1,384 @@ +package source + +import ( + "log/slog" + "time" + + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/utils" + 
"github.com/internetarchive/gocrawlhq" +) + +// This function connects to HQ's websocket and listen for messages. +// It also sends and "identify" message to the HQ to let it know that +// Zeno is connected. This "identify" message is sent every second and +// contains the crawler's stats and details. +func HQWebsocket() { + var ( + // the "identify" message will be sent every second + // to the crawl HQ + identifyTicker = time.NewTicker(time.Second) + ) + + defer func() { + identifyTicker.Stop() + }() + + // send an "identify" message to the crawl HQ every second + for { + err := HQClient.Identify(&gocrawlhq.IdentifyMessage{ + Project: config.Get().HQProject, + Job: config.Get().Job, + IP: utils.GetOutboundIP().String(), + Hostname: utils.GetHostname(), + GoVersion: utils.GetVersion().GoVersion, + }) + if err != nil { + slog.Error("error sending identify payload to Crawl HQ, trying to reconnect", "err", err.Error()) + + err = HQClient.InitWebsocketConn() + if err != nil { + slog.Error("error initializing websocket connection to crawl HQ", "err", err.Error()) + } + } + + <-identifyTicker.C + } +} + +// func HQProducer() { +// defer c.HQChannelsWg.Done() + +// var ( +// discoveredArray = []gocrawlhq.URL{} +// mutex = sync.Mutex{} +// terminateProducer = make(chan bool) +// ) + +// // the discoveredArray is sent to the crawl HQ every 10 seconds +// // or when it reaches a certain size +// go func() { +// HQLastSent := time.Now() + +// for { +// select { +// case <-terminateProducer: +// // no need to lock the mutex here, because the producer channel +// // is already closed, so no other goroutine can write to the slice +// if len(discoveredArray) > 0 { +// for { +// err := c.HQClient.Add(discoveredArray, false) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..") +// time.Sleep(time.Second) +// continue +// } +// break +// } +// } + +// return +// default: 
+// mutex.Lock() +// if (len(discoveredArray) >= int(math.Ceil(float64(c.Workers.Count)/2)) || time.Since(HQLastSent) >= time.Second*10) && len(discoveredArray) > 0 { +// for { +// err := c.HQClient.Add(discoveredArray, false) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..") +// time.Sleep(time.Second) +// continue +// } +// break +// } + +// discoveredArray = []gocrawlhq.URL{} +// HQLastSent = time.Now() +// } +// mutex.Unlock() +// } +// } +// }() + +// // listen to the discovered channel and add the URLs to the discoveredArray +// for discoveredItem := range c.HQProducerChannel { +// var via string + +// if discoveredItem.ParentURL != nil { +// via = utils.URLToString(discoveredItem.ParentURL) +// } + +// discoveredURL := gocrawlhq.URL{ +// Value: utils.URLToString(discoveredItem.URL), +// Via: via, +// } + +// for i := uint64(0); i < discoveredItem.Hop; i++ { +// discoveredURL.Path += "L" +// } + +// // The reason we are using a string instead of a bool is because +// // gob's encode/decode doesn't properly support booleans +// if discoveredItem.BypassSeencheck { +// for { +// err := c.HQClient.Add([]gocrawlhq.URL{discoveredURL}, true) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ +// "bypassSeencheck": discoveredItem.BypassSeencheck, +// })).Error("error sending payload to crawl HQ, waiting 1s then retrying..") +// time.Sleep(time.Second) +// continue +// } +// break +// } +// continue +// } + +// mutex.Lock() +// discoveredArray = append(discoveredArray, discoveredURL) +// mutex.Unlock() +// } + +// // if we are here, it means that the HQProducerChannel has been closed +// // so we need to send the last payload to the crawl HQ +// terminateProducer <- true +// } + +// func HQConsumer() { +// for { +// c.HQConsumerState = "running" + +// // This is on purpose evaluated every time, +// // because the 
value of workers will maybe change +// // during the crawl in the future (to be implemented) +// var HQBatchSize = int(c.Workers.Count) + +// if c.Finished.Get() { +// c.HQConsumerState = "finished" +// c.Log.Error("crawl finished, stopping HQ consumer") +// break +// } + +// // If HQContinuousPull is set to true, we will pull URLs from HQ continuously, +// // otherwise we will only pull URLs when needed (and when the crawl is not paused) +// for (c.Queue.GetStats().TotalElements > HQBatchSize && !c.HQContinuousPull) || c.Paused.Get() || c.Queue.HandoverOpen.Get() { +// c.HQConsumerState = "waiting" +// c.Log.Info("HQ producer waiting", "paused", c.Paused.Get(), "handoverOpen", c.Queue.HandoverOpen.Get(), "queueSize", c.Queue.GetStats().TotalElements) +// time.Sleep(time.Millisecond * 50) +// continue +// } + +// // If a specific HQ batch size is set, use it +// if c.HQBatchSize != 0 { +// HQBatchSize = c.HQBatchSize +// } + +// // get batch from crawl HQ +// c.HQConsumerState = "waitingOnFeed" +// var URLs []gocrawlhq.URL +// var err error +// if c.HQBatchConcurrency == 1 { +// URLs, err = c.HQClient.Get(HQBatchSize, c.HQStrategy) +// if err != nil { +// // c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ +// // "batchSize": HQBatchSize, +// // "err": err, +// // })).Debug("error getting new URLs from crawl HQ") +// continue +// } +// } else { +// var mu sync.Mutex +// var wg sync.WaitGroup +// batchSize := HQBatchSize / c.HQBatchConcurrency +// URLsChan := make(chan []gocrawlhq.URL, c.HQBatchConcurrency) + +// // Start goroutines to get URLs from crawl HQ, each will request +// // HQBatchSize / HQConcurrentBatch URLs +// for i := 0; i < c.HQBatchConcurrency; i++ { +// wg.Add(1) +// go func() { +// defer wg.Done() +// URLs, err := c.HQClient.Get(batchSize, c.HQStrategy) +// if err != nil { +// // c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ +// // "batchSize": batchSize, +// // "err": err, +// // })).Debug("error getting new 
URLs from crawl HQ") +// return +// } +// URLsChan <- URLs +// }() +// } + +// // Wait for all goroutines to finish +// go func() { +// wg.Wait() +// close(URLsChan) +// }() + +// // Collect all URLs from the channels +// for URLsFromChan := range URLsChan { +// mu.Lock() +// URLs = append(URLs, URLsFromChan...) +// mu.Unlock() +// } +// } +// c.HQConsumerState = "feedCompleted" + +// // send all URLs received in the batch to the queue +// var items = make([]*queue.Item, 0, len(URLs)) +// if len(URLs) > 0 { +// for _, URL := range URLs { +// c.HQConsumerState = "urlParse" +// newURL, err := url.Parse(URL.Value) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ +// "url": URL.Value, +// "batchSize": HQBatchSize, +// "err": err, +// })).Error("unable to parse URL received from crawl HQ, discarding") +// continue +// } + +// c.HQConsumerState = "newItem" +// newItem, err := queue.NewItem(newURL, nil, "seed", uint64(strings.Count(URL.Path, "L")), URL.ID, false) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, newURL, map[string]interface{}{ +// "url": URL.Value, +// "batchSize": HQBatchSize, +// "err": err, +// })).Error("unable to create new item from URL received from crawl HQ, discarding") +// continue +// } + +// c.HQConsumerState = "append" +// items = append(items, newItem) +// } +// } + +// c.HQConsumerState = "enqueue" +// err = c.Queue.BatchEnqueue(items...) 
+// if err != nil { +// c.Log.Error("unable to enqueue URL batch received from crawl HQ, discarding", "error", err) +// continue +// } +// } +// } + +// func HQFinisher() { +// defer c.HQChannelsWg.Done() + +// var ( +// finishedArray = []gocrawlhq.URL{} +// locallyCrawledTotal int +// ) + +// for finishedItem := range c.HQFinishedChannel { +// if finishedItem.ID == "" { +// c.Log.WithFields(c.genLogFields(nil, finishedItem.URL, nil)).Warn("URL has no ID, discarding") +// continue +// } + +// locallyCrawledTotal += int(finishedItem.LocallyCrawled) +// finishedArray = append(finishedArray, gocrawlhq.URL{ID: finishedItem.ID, Value: utils.URLToString(finishedItem.URL)}) + +// if len(finishedArray) == int(math.Ceil(float64(c.Workers.Count)/2)) { +// for { +// err := c.HQClient.Delete(finishedArray, locallyCrawledTotal) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ +// "finishedArray": finishedArray, +// })).Error("error submitting finished urls to crawl HQ. retrying in one second...") +// time.Sleep(time.Second) +// continue +// } +// break +// } + +// finishedArray = []gocrawlhq.URL{} +// locallyCrawledTotal = 0 +// } +// } + +// // send remaining finished URLs +// if len(finishedArray) > 0 { +// for { +// err := c.HQClient.Delete(finishedArray, locallyCrawledTotal) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ +// "finishedArray": finishedArray, +// })).Error("error submitting finished urls to crawl HQ. 
retrying in one second...") +// time.Sleep(time.Second) +// continue +// } +// break +// } +// } +// } + +// func HQSeencheckURLs(URLs []*url.URL) (seencheckedBatch []*url.URL, err error) { +// var ( +// discoveredURLs []gocrawlhq.URL +// ) + +// for _, URL := range URLs { +// discoveredURLs = append(discoveredURLs, gocrawlhq.URL{ +// Value: utils.URLToString(URL), +// Type: "asset", +// }) +// } + +// outputURLs, err := c.HQClient.Seencheck(discoveredURLs) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ +// "batchLen": len(URLs), +// "urls": discoveredURLs, +// })).Error("error sending seencheck payload to crawl HQ") +// return seencheckedBatch, err +// } + +// if outputURLs != nil { +// for _, URL := range outputURLs { +// // the returned payload only contain new URLs to be crawled by Zeno +// newURL, err := url.Parse(URL.Value) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, URL, map[string]interface{}{ +// "batchLen": len(URLs), +// })).Error("error parsing URL from HQ seencheck response") +// return seencheckedBatch, err +// } + +// seencheckedBatch = append(seencheckedBatch, newURL) +// } +// } + +// return seencheckedBatch, nil +// } + +// returns: +// - bool: true if the URL is new, false if it has been seen before +// - error: if there's an error sending the payload to crawl HQ +// +// NOTE: if there's an error, the URL is considered new +// func HQSeencheckURL(URL *url.URL) (bool, error) { +// discoveredURL := gocrawlhq.URL{ +// Value: utils.URLToString(URL), +// Type: "asset", +// } + +// outputURLs, err := HQClient.Seencheck([]gocrawlhq.URL{discoveredURL}) +// if err != nil { +// Log.Error("error sending seencheck payload to crawl HQ", "err", err, "url", utils.URLToString(URL)) +// return true, err // return true, don't discard the URL if there's an error +// } + +// if outputURLs != nil { +// for _, URL := range outputURLs { +// // the returned payload only contain new URLs to be crawled by Zeno 
+// if URL.Value == discoveredURL.Value { +// return true, nil +// } +// } +// } + +// return false, nil +// } diff --git a/internal/pkg/source/source.go b/internal/pkg/source/source.go new file mode 100644 index 00000000..beb16e87 --- /dev/null +++ b/internal/pkg/source/source.go @@ -0,0 +1,7 @@ +package source + +import "github.com/internetarchive/gocrawlhq" + +var ( + HQClient *gocrawlhq.Client +) diff --git a/internal/pkg/config/version.go b/internal/pkg/utils/version.go similarity index 94% rename from internal/pkg/config/version.go rename to internal/pkg/utils/version.go index 8dd16d1c..89ba3a84 100644 --- a/internal/pkg/config/version.go +++ b/internal/pkg/utils/version.go @@ -1,4 +1,4 @@ -package config +package utils import ( "runtime/debug" @@ -11,7 +11,7 @@ type Version struct { ZenoVersion string } -func getVersion() (version Version) { +func GetVersion() (version Version) { // Defaults to "unknown_version" version.Version = "unknown_version" diff --git a/pkg/models/seed.go b/pkg/models/seed.go index 1eecbdcb..8f6a7a8d 100644 --- a/pkg/models/seed.go +++ b/pkg/models/seed.go @@ -2,17 +2,16 @@ package models import ( "github.com/google/uuid" - "github.com/internetarchive/gocrawlhq" ) // Item represents a URL, it's childs (e.g. 
discovered assets) and it's state in the pipeline type Item struct { - UUID *uuid.UUID // UUID is the unique identifier of the item - URL *gocrawlhq.URL // URL is the URL of the item - Status ItemState // Status is the state of the item in the pipeline - Source ItemSource // Source is the source of the item in the pipeline - ChildsCaptured bool // ChildsCaptured is the flag to indicate if the child URLs of the item have been captured - Childs []*gocrawlhq.URL // Childs is the list of URLs that have been discovered via the item's URL + UUID *uuid.UUID // UUID is the unique identifier of the item + URL *URL // URL is the URL of the item + Status ItemState // Status is the state of the item in the pipeline + Source ItemSource // Source is the source of the item in the pipeline + ChildsCaptured bool // ChildsCaptured is the flag to indicate if the child URLs of the item have been captured + Childs []*URL // Childs is the list of URLs that have been discovered via the item's URL } // ItemState qualifies the state of a item in the pipeline diff --git a/pkg/models/url.go b/pkg/models/url.go new file mode 100644 index 00000000..121c08f2 --- /dev/null +++ b/pkg/models/url.go @@ -0,0 +1,15 @@ +package models + +import ( + "net/url" + + "github.com/internetarchive/gocrawlhq" +) + +type URL struct { + gocrawlhq.URL +} + +func (u *URL) Parsed() (URL *url.URL, err error) { + return url.Parse(u.Value) +} From 6bd937468682d6397967c1d1f5f6bad69ce01708 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 14:17:50 +0100 Subject: [PATCH 017/295] add logging package --- go.mod | 2 + go.sum | 4 + internal/pkg/log/config.go | 46 ++++++++++ internal/pkg/log/destinations.go | 42 ++++++++++ internal/pkg/log/elastic_destination.go | 106 ++++++++++++++++++++++++ internal/pkg/log/fielded_logger.go | 56 +++++++++++++ internal/pkg/log/file_destination.go | 89 ++++++++++++++++++++ internal/pkg/log/log.go | 71 ++++++++++++++++ internal/pkg/log/logger.go | 88 ++++++++++++++++++++ 
internal/pkg/log/std_destination.go | 49 +++++++++++ internal/pkg/reactor/reactor.go | 16 +++- main.go | 17 +++- 12 files changed, 579 insertions(+), 7 deletions(-) create mode 100644 internal/pkg/log/config.go create mode 100644 internal/pkg/log/destinations.go create mode 100644 internal/pkg/log/elastic_destination.go create mode 100644 internal/pkg/log/fielded_logger.go create mode 100644 internal/pkg/log/file_destination.go create mode 100644 internal/pkg/log/log.go create mode 100644 internal/pkg/log/logger.go create mode 100644 internal/pkg/log/std_destination.go diff --git a/go.mod b/go.mod index 4cf1333c..6dcb22b9 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,8 @@ go 1.23.3 require ( github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 + github.com/elastic/go-elasticsearch v0.0.0 + github.com/elastic/go-elasticsearch/v7 v7.17.10 github.com/google/uuid v1.6.0 github.com/internetarchive/gocrawlhq v1.2.20 github.com/spf13/cobra v1.8.1 diff --git a/go.sum b/go.sum index 3cf46f98..5de95c8c 100644 --- a/go.sum +++ b/go.sum @@ -5,6 +5,10 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/elastic/go-elasticsearch v0.0.0 h1:Pd5fqOuBxKxv83b0+xOAJDAkziWYwFinWnBO0y+TZaA= +github.com/elastic/go-elasticsearch v0.0.0/go.mod h1:TkBSJBuTyFdBnrNqoPc54FN0vKf5c04IdM4zuStJ7xg= +github.com/elastic/go-elasticsearch/v7 v7.17.10 h1:TCQ8i4PmIJuBunvBS6bwT2ybzVFxxUhhltAs3Gyu1yo= +github.com/elastic/go-elasticsearch/v7 v7.17.10/go.mod h1:OJ4wdbtDNk5g503kvlHLyErCgQwwzmDtaFC4XyOxXA4= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod 
h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= diff --git a/internal/pkg/log/config.go b/internal/pkg/log/config.go new file mode 100644 index 00000000..10f29f5c --- /dev/null +++ b/internal/pkg/log/config.go @@ -0,0 +1,46 @@ +// config.go +package log + +import ( + "log/slog" + "time" +) + +// Config defines the configuration for the logging package +type Config struct { + FileConfig *LogfileConfig + StdoutEnabled bool + StdoutLevel slog.Level + StderrEnabled bool + StderrLevel slog.Level + RotateLogFile bool + RotatePeriod time.Duration + ElasticsearchConfig *ElasticsearchConfig + RotateElasticSearchIndex bool +} + +// LogfileConfig defines the configuration for file logging +type LogfileConfig struct { + Dir string + Prefix string + Level slog.Level +} + +// ElasticsearchConfig defines the configuration for Elasticsearch logging +type ElasticsearchConfig struct { + Addresses []string + Username string + Password string + IndexPrefix string + Level slog.Level +} + +// defaultConfig returns the default configuration +func defaultConfig() *Config { + return &Config{ + StdoutEnabled: true, + StdoutLevel: slog.LevelInfo, + StderrEnabled: true, + StderrLevel: slog.LevelError, + } +} diff --git a/internal/pkg/log/destinations.go b/internal/pkg/log/destinations.go new file mode 100644 index 00000000..05de1909 --- /dev/null +++ b/internal/pkg/log/destinations.go @@ -0,0 +1,42 @@ +// destination.go +package log + +import ( + "log/slog" +) + +// Destination interface +type Destination interface { + Enabled() bool + Level() slog.Level + Write(entry *logEntry) + Close() +} + +func initDestinations() []Destination { + var destinations []Destination + + if config.StdoutEnabled { + destinations = append(destinations, &StdoutDestination{ + level: config.StdoutLevel, + }) + } + + if config.StderrEnabled { + destinations = append(destinations, &StderrDestination{ + level: config.StderrLevel, + }) 
+ } + + if config.FileConfig != nil { + fileDest := NewFileDestination(config.FileConfig) + destinations = append(destinations, fileDest) + } + + if config.ElasticsearchConfig != nil { + esDest := NewElasticsearchDestination(config.ElasticsearchConfig) + destinations = append(destinations, esDest) + } + + return destinations +} diff --git a/internal/pkg/log/elastic_destination.go b/internal/pkg/log/elastic_destination.go new file mode 100644 index 00000000..a302a33e --- /dev/null +++ b/internal/pkg/log/elastic_destination.go @@ -0,0 +1,106 @@ +package log + +import ( + "bytes" + "context" + "encoding/json" + "log/slog" + "sync" + "time" + + "github.com/elastic/go-elasticsearch/esapi" + elastic "github.com/elastic/go-elasticsearch/v7" +) + +// ElasticsearchDestination logs to Elasticsearch +type ElasticsearchDestination struct { + level slog.Level + config *ElasticsearchConfig + client *elastic.Client + index string + mu sync.Mutex + ticker *time.Ticker + closeChan chan struct{} +} + +func NewElasticsearchDestination(cfg *ElasticsearchConfig) *ElasticsearchDestination { + es, err := elastic.NewClient(elastic.Config{ + Addresses: cfg.Addresses, + Username: cfg.Username, + Password: cfg.Password, + }) + if err != nil { + // Handle error (for simplicity, we'll just ignore it here) + return nil + } + + ed := &ElasticsearchDestination{ + level: cfg.Level, + config: cfg, + client: es, + index: cfg.IndexPrefix + "-" + time.Now().Format("2006.01.02"), + closeChan: make(chan struct{}), + } + + if config.RotateElasticSearchIndex && config.RotatePeriod > 0 { + ed.ticker = time.NewTicker(config.RotatePeriod) + go ed.rotationWorker() + } + + return ed +} + +func (d *ElasticsearchDestination) Enabled() bool { + return d.client != nil +} + +func (d *ElasticsearchDestination) Level() slog.Level { + return d.level +} + +func (d *ElasticsearchDestination) Write(entry *logEntry) { + doc := map[string]interface{}{ + "timestamp": time.Now().Format(time.RFC3339), + "level": 
entry.level.String(), + "message": entry.msg, + "fields": entry.args, + } + + var buf bytes.Buffer + if err := json.NewEncoder(&buf).Encode(doc); err != nil { + // Handle error + return + } + + req := esapi.IndexRequest{ + Index: d.index, + DocumentID: "", // Auto-generate ID + Body: &buf, + Refresh: "true", + } + + _, err := req.Do(context.Background(), d.client) + if err != nil { + // Handle error + } +} + +func (d *ElasticsearchDestination) Close() { + if d.ticker != nil { + d.ticker.Stop() + } + close(d.closeChan) +} + +func (d *ElasticsearchDestination) rotationWorker() { + for { + select { + case <-d.ticker.C: + d.mu.Lock() + d.index = d.config.IndexPrefix + "-" + time.Now().Format("2006.01.02") + d.mu.Unlock() + case <-d.closeChan: + return + } + } +} diff --git a/internal/pkg/log/fielded_logger.go b/internal/pkg/log/fielded_logger.go new file mode 100644 index 00000000..6cb83d22 --- /dev/null +++ b/internal/pkg/log/fielded_logger.go @@ -0,0 +1,56 @@ +package log + +import ( + "log/slog" +) + +// Field defines an interface for fields +type Fields map[string]interface{} + +// FieldedLogger allows adding predefined fields to log entries +type FieldedLogger struct { + fields *Fields +} + +// NewFieldedLogger creates a new FieldedLogger with the given fields +func NewFieldedLogger(args *Fields) *FieldedLogger { + return &FieldedLogger{ + fields: args, + } +} + +// FieldedLogger methods +func (fl *FieldedLogger) Debug(msg string, args ...any) { + fl.logWithLevel(slog.LevelDebug, msg, args...) +} + +func (fl *FieldedLogger) Info(msg string, args ...any) { + fl.logWithLevel(slog.LevelInfo, msg, args...) +} + +func (fl *FieldedLogger) Warn(msg string, args ...any) { + fl.logWithLevel(slog.LevelWarn, msg, args...) +} + +func (fl *FieldedLogger) Error(msg string, args ...any) { + fl.logWithLevel(slog.LevelError, msg, args...) 
+} + +func (fl *FieldedLogger) logWithLevel(level slog.Level, msg string, args ...any) { + var combinedArgs []any + + if fl.fields != nil { + for k, v := range *fl.fields { + combinedArgs = append(combinedArgs, k) + combinedArgs = append(combinedArgs, v) + } + } + + if len(args) > 0 { + for _, arg := range args { + combinedArgs = append(combinedArgs, arg) + } + } + + logWithLevel(level, msg, combinedArgs...) +} diff --git a/internal/pkg/log/file_destination.go b/internal/pkg/log/file_destination.go new file mode 100644 index 00000000..e5e239b2 --- /dev/null +++ b/internal/pkg/log/file_destination.go @@ -0,0 +1,89 @@ +package log + +import ( + "fmt" + "log/slog" + "os" + "sync" + "time" +) + +// FileDestination logs to a file with rotation +type FileDestination struct { + level slog.Level + config *LogfileConfig + file *os.File + mu sync.Mutex + ticker *time.Ticker + closeChan chan struct{} +} + +func NewFileDestination(cfg *LogfileConfig) *FileDestination { + fd := &FileDestination{ + level: cfg.Level, + config: cfg, + closeChan: make(chan struct{}), + } + + fd.rotateFile() + if config.RotateLogFile && config.RotatePeriod > 0 { + fd.ticker = time.NewTicker(config.RotatePeriod) + go fd.rotationWorker() + } + + return fd +} + +func (d *FileDestination) Enabled() bool { + return true +} + +func (d *FileDestination) Level() slog.Level { + return d.level +} + +func (d *FileDestination) Write(entry *logEntry) { + d.mu.Lock() + defer d.mu.Unlock() + if d.file != nil { + fmt.Fprintln(d.file, formatLogEntry(entry)) + } +} + +func (d *FileDestination) Close() { + if d.ticker != nil { + d.ticker.Stop() + } + close(d.closeChan) + d.mu.Lock() + if d.file != nil { + d.file.Close() + } + d.mu.Unlock() +} + +func (d *FileDestination) rotateFile() { + d.mu.Lock() + defer d.mu.Unlock() + if d.file != nil { + d.file.Close() + } + filename := fmt.Sprintf("%s/%s-%s.log", d.config.Dir, d.config.Prefix, time.Now().Format("2006.01.02T15-04")) + file, err := os.OpenFile(filename, 
os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644) + if err != nil { + // Handle error (for simplicity, we'll just ignore it here) + return + } + d.file = file +} + +func (d *FileDestination) rotationWorker() { + for { + select { + case <-d.ticker.C: + d.rotateFile() + case <-d.closeChan: + return + } + } +} diff --git a/internal/pkg/log/log.go b/internal/pkg/log/log.go new file mode 100644 index 00000000..307be538 --- /dev/null +++ b/internal/pkg/log/log.go @@ -0,0 +1,71 @@ +// log.go +package log + +import ( + "context" + "log/slog" + "sync" + "time" +) + +// Global variables +var ( + config *Config + logQueue chan *logEntry + once sync.Once + wg sync.WaitGroup + cancelFunc context.CancelFunc +) + +// Init initializes the logging package with the given configuration. +// If no configuration is provided, it uses the default configuration. +func Init(cfgs ...*Config) { + once.Do(func() { + if len(cfgs) > 0 && cfgs[0] != nil { + config = cfgs[0] + } else { + config = defaultConfig() + } + setupLogger() + }) +} + +// Public logging methods +func Debug(msg string, args ...any) { + logWithLevel(slog.LevelDebug, msg, args...) +} + +func Info(msg string, args ...any) { + logWithLevel(slog.LevelInfo, msg, args...) +} + +func Warn(msg string, args ...any) { + logWithLevel(slog.LevelWarn, msg, args...) +} + +func Error(msg string, args ...any) { + logWithLevel(slog.LevelError, msg, args...) 
+} + +// logWithLevel sends the log entry to the logQueue +func logWithLevel(level slog.Level, msg string, args ...any) { + entry := &logEntry{ + timestamp: time.Now(), + level: level, + msg: msg, + args: args, + } + select { + case logQueue <- entry: + default: + slog.Error("Log queue is full, dropping log entry from logger", "msg", msg, "args", args) + } +} + +// Shutdown gracefully shuts down the logging system +func Shutdown() { + if cancelFunc != nil { + cancelFunc() + } + wg.Wait() +} diff --git a/internal/pkg/log/logger.go b/internal/pkg/log/logger.go new file mode 100644 index 00000000..cac423c5 --- /dev/null +++ b/internal/pkg/log/logger.go @@ -0,0 +1,88 @@ +// logger.go +package log + +import ( + "context" + "fmt" + "log/slog" + "strings" + "time" +) + +type logEntry struct { + timestamp time.Time + level slog.Level + msg string + args []any +} + +func setupLogger() { + // Initialize the log queue + logQueue = make(chan *logEntry, 1000) + + // Create a cancellable context + var ctx context.Context + ctx, cancelFunc = context.WithCancel(context.Background()) + + // Start the log processing goroutine + wg.Add(1) + go processLogQueue(ctx) +} + +func processLogQueue(ctx context.Context) { + defer wg.Done() + + // Initialize log destinations + destinations := initDestinations() + + for { + select { + case entry := <-logQueue: + // Process the log entry + for _, dest := range destinations { + if dest.Enabled() && entry.level >= dest.Level() { + dest.Write(entry) + } + } + case <-ctx.Done(): + // Drain the log queue before exiting + for len(logQueue) > 0 { + entry := <-logQueue + for _, dest := range destinations { + if dest.Enabled() && entry.level >= dest.Level() { + dest.Write(entry) + } + } + } + // Close destinations + for _, dest := range destinations { + dest.Close() + } + return + } + } +} + +// Helper function to format args +func formatArgs(args []any) string { + var sb strings.Builder + + for i := 0; i < len(args); i += 2 { + if i+1 < len(args) { + 
sb.WriteString(fmt.Sprintf("%v=%v", args[i], args[i+1])) + } else { + sb.WriteString(fmt.Sprintf("%v", args[i])) + } + if i+2 < len(args) { + sb.WriteString(" ") + } + } + + return sb.String() +} + +// Helper function to format log entries +func formatLogEntry(entry *logEntry) string { + + return fmt.Sprintf("%s [%s] %s %s", entry.timestamp.Format(time.RFC3339), entry.level.String(), entry.msg, formatArgs(entry.args)) +} diff --git a/internal/pkg/log/std_destination.go b/internal/pkg/log/std_destination.go new file mode 100644 index 00000000..bc5e9593 --- /dev/null +++ b/internal/pkg/log/std_destination.go @@ -0,0 +1,49 @@ +package log + +import ( + "fmt" + "log/slog" + "os" +) + +// StdoutDestination logs to stdout +type StdoutDestination struct { + level slog.Level +} + +func (d *StdoutDestination) Enabled() bool { + return true +} + +func (d *StdoutDestination) Level() slog.Level { + return d.level +} + +func (d *StdoutDestination) Write(entry *logEntry) { + if entry.level < config.StderrLevel || !config.StderrEnabled { + fmt.Println(formatLogEntry(entry)) + } +} + +func (d *StdoutDestination) Close() {} + +// StderrDestination logs to stderr +type StderrDestination struct { + level slog.Level +} + +func (d *StderrDestination) Enabled() bool { + return true +} + +func (d *StderrDestination) Level() slog.Level { + return d.level +} + +func (d *StderrDestination) Write(entry *logEntry) { + if entry.level >= config.StderrLevel { + fmt.Fprintln(os.Stderr, formatLogEntry(entry)) + } +} + +func (d *StderrDestination) Close() {} diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index 2489c174..db4c296c 100644 --- a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -3,9 +3,9 @@ package reactor import ( "context" - "log/slog" "sync" + "github.com/internetarchive/Zeno/internal/pkg/log" "github.com/internetarchive/Zeno/pkg/models" ) @@ -23,8 +23,16 @@ type reactor struct { var ( globalReactor *reactor once sync.Once + 
logger *log.FieldedLogger ) +func init() { + log.Init() + logger = log.NewFieldedLogger(&log.Fields{ + "component": "reactor", + }) +} + // Start initializes the global reactor with the given maximum tokens. // This method can only be called once. func Start(maxTokens int, outputChan chan *models.Item) error { @@ -41,7 +49,7 @@ func Start(maxTokens int, outputChan chan *models.Item) error { } globalReactor.wg.Add(1) go globalReactor.run() - slog.Info("reactor started") + logger.Info("started") done = true }) @@ -58,7 +66,7 @@ func Stop() { globalReactor.cancel() globalReactor.wg.Wait() close(globalReactor.output) - slog.Info("reactor stopped") + logger.Info("stopped") } } @@ -123,7 +131,7 @@ func (r *reactor) run() { select { // Closes the run routine when context is canceled case <-r.ctx.Done(): - slog.Info("reactor shutting down") + logger.Info("shutting down") return // Feeds items to the output channel diff --git a/main.go b/main.go index 4abeffe6..f268c106 100644 --- a/main.go +++ b/main.go @@ -10,16 +10,27 @@ package main import ( "fmt" - "log/slog" "os" "github.com/internetarchive/Zeno/cmd" "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/log" "github.com/internetarchive/Zeno/internal/pkg/preprocessor" "github.com/internetarchive/Zeno/internal/pkg/reactor" "github.com/internetarchive/Zeno/pkg/models" ) +var ( + logger *log.FieldedLogger +) + +func init() { + log.Init() + logger = log.NewFieldedLogger(&log.Fields{ + "component": "main", + }) +} + func main() { if err := cmd.Run(); err != nil { fmt.Println(err) @@ -32,7 +43,7 @@ func main() { reactorOutputChan := make(chan *models.Item) err := reactor.Start(config.Get().WorkersCount, reactorOutputChan) if err != nil { - slog.Error("error starting reactor", "err", err.Error()) + logger.Error("error starting reactor", "err", err.Error()) return } defer reactor.Stop() @@ -40,7 +51,7 @@ func main() { preprocessorOutputChan := make(chan *models.Item) err = 
preprocessor.Start(reactorOutputChan, preprocessorOutputChan) if err != nil { - slog.Error("error starting preprocessor", "err", err.Error()) + logger.Error("error starting preprocessor", "err", err.Error()) return } defer preprocessor.Stop() From 889ea614c1eaf2b5db5e4ca85e1dfda289fa4ff0 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 14:23:46 +0100 Subject: [PATCH 018/295] replace preprocessor logging --- internal/pkg/preprocessor/preprocessor.go | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index 54e4eb4c..19530267 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -2,10 +2,10 @@ package preprocessor import ( "context" - "log/slog" "sync" "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/log" "github.com/internetarchive/Zeno/pkg/models" ) @@ -20,8 +20,16 @@ type preprocessor struct { var ( globalPreprocessor *preprocessor once sync.Once + logger *log.FieldedLogger ) +func init() { + log.Init() + logger = log.NewFieldedLogger(&log.Fields{ + "component": "preprocessor", + }) +} + // This functions starts the preprocessor responsible for preparing // the seeds sent by the reactor for captures func Start(inputChan, outputChan chan *models.Item) error { @@ -37,7 +45,7 @@ func Start(inputChan, outputChan chan *models.Item) error { } globalPreprocessor.wg.Add(1) go globalPreprocessor.run() - slog.Info("preprocessor started") + logger.Info("started") done = true }) @@ -53,7 +61,7 @@ func Stop() { globalPreprocessor.cancel() globalPreprocessor.wg.Wait() close(globalPreprocessor.output) - slog.Info("preprocessor stopped") + logger.Info("stopped") } } @@ -69,7 +77,7 @@ func (p *preprocessor) run() { select { // Closes the run routine when context is canceled case <-p.ctx.Done(): - slog.Info("preprocessor shutting down") 
+ logger.Info("shutting down") return case item, ok := <-p.input: if ok { @@ -92,7 +100,7 @@ func (p *preprocessor) preprocess(item *models.Item) { // Preprocess the item's URL itself item.URL.Value, err = validateURL(item.URL.Value, nil) if err != nil { - slog.Warn("unable to validate URL", "url", item.URL.Value, "err", err.Error(), "func", "preprocessor.preprocess") + logger.Warn("unable to validate URL", "url", item.URL.Value, "err", err.Error(), "func", "preprocessor.preprocess") return } } else if len(item.Childs) > 0 { @@ -102,14 +110,14 @@ func (p *preprocessor) preprocess(item *models.Item) { item.Childs[i].Value, err = validateURL(child.Value, item.URL) if err != nil { // If we can't validate an URL, we remove it from the list of childs - slog.Warn("unable to validate URL", "url", child.Value, "err", err.Error(), "func", "preprocessor.preprocess") + logger.Warn("unable to validate URL", "url", child.Value, "err", err.Error(), "func", "preprocessor.preprocess") item.Childs = append(item.Childs[:i], item.Childs[i+1:]...) 
} else { i++ } } } else { - slog.Error("item got into preprocessing without anything to preprocess") + logger.Error("item got into preprocessing without anything to preprocess") } // Final step, send the preprocessed item to the output chan of the preprocessor From 703649afcabcbd2dbe91a0c0225af376de141ca1 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 14:29:54 +0100 Subject: [PATCH 019/295] proper handling of start and stop of the logger --- internal/pkg/log/logger.go | 2 +- internal/pkg/preprocessor/preprocessor.go | 1 - internal/pkg/reactor/reactor.go | 1 - main.go | 1 + 4 files changed, 2 insertions(+), 3 deletions(-) diff --git a/internal/pkg/log/logger.go b/internal/pkg/log/logger.go index cac423c5..5162e031 100644 --- a/internal/pkg/log/logger.go +++ b/internal/pkg/log/logger.go @@ -25,11 +25,11 @@ func setupLogger() { ctx, cancelFunc = context.WithCancel(context.Background()) // Start the log processing goroutine - wg.Add(1) go processLogQueue(ctx) } func processLogQueue(ctx context.Context) { + wg.Add(1) defer wg.Done() // Initialize log destinations diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index 19530267..72dbb462 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -24,7 +24,6 @@ var ( ) func init() { - log.Init() logger = log.NewFieldedLogger(&log.Fields{ "component": "preprocessor", }) diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index db4c296c..20526cdf 100644 --- a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -27,7 +27,6 @@ var ( ) func init() { - log.Init() logger = log.NewFieldedLogger(&log.Fields{ "component": "reactor", }) diff --git a/main.go b/main.go index f268c106..4ec356cc 100644 --- a/main.go +++ b/main.go @@ -32,6 +32,7 @@ func init() { } func main() { + defer log.Shutdown() if err := cmd.Run(); err != nil { fmt.Println(err) os.Exit(1) From 
d4ca7dd31f382d42da16ac67d5e25cf1100bf1d6 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 14:32:31 +0100 Subject: [PATCH 020/295] logger: use tabs instead of spaces --- internal/pkg/log/logger.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/pkg/log/logger.go b/internal/pkg/log/logger.go index 5162e031..1fd5a8f6 100644 --- a/internal/pkg/log/logger.go +++ b/internal/pkg/log/logger.go @@ -74,7 +74,7 @@ func formatArgs(args []any) string { sb.WriteString(fmt.Sprintf("%v", args[i])) } if i+2 < len(args) { - sb.WriteString(" ") + sb.WriteString("\t") } } @@ -84,5 +84,5 @@ func formatArgs(args []any) string { // Helper function to format log entries func formatLogEntry(entry *logEntry) string { - return fmt.Sprintf("%s [%s] %s %s", entry.timestamp.Format(time.RFC3339), entry.level.String(), entry.msg, formatArgs(entry.args)) + return fmt.Sprintf("%s [%s] %s\t%s", entry.timestamp.Format(time.RFC3339), entry.level.String(), entry.msg, formatArgs(entry.args)) } From 403cbbd3693581e0bd34a62a252923b1625b0b53 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 14:53:58 +0100 Subject: [PATCH 021/295] global: init logger for every package so that logging works on unit tests --- internal/pkg/log/logger.go | 2 +- internal/pkg/preprocessor/preprocessor.go | 2 ++ internal/pkg/reactor/reactor.go | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/internal/pkg/log/logger.go b/internal/pkg/log/logger.go index 1fd5a8f6..e0d432da 100644 --- a/internal/pkg/log/logger.go +++ b/internal/pkg/log/logger.go @@ -18,7 +18,7 @@ type logEntry struct { func setupLogger() { // Initialize the log queue - logQueue = make(chan *logEntry, 1000) + logQueue = make(chan *logEntry, 10000) // Create a cancellable context var ctx context.Context diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index 72dbb462..35bf231a 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ 
b/internal/pkg/preprocessor/preprocessor.go @@ -24,6 +24,7 @@ var ( ) func init() { + log.Init() logger = log.NewFieldedLogger(&log.Fields{ "component": "preprocessor", }) @@ -61,6 +62,7 @@ func Stop() { globalPreprocessor.wg.Wait() close(globalPreprocessor.output) logger.Info("stopped") + log.Shutdown() } } diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index 20526cdf..db4c296c 100644 --- a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -27,6 +27,7 @@ var ( ) func init() { + log.Init() logger = log.NewFieldedLogger(&log.Fields{ "component": "reactor", }) From 2356d9af8025673158c56d6fc671a83c6164c4f4 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 19 Nov 2024 14:56:21 +0100 Subject: [PATCH 022/295] rework URL struct --- internal/pkg/preprocessor/preprocessor.go | 9 +- internal/pkg/preprocessor/url.go | 6 +- internal/pkg/reactor/reactor_test.go | 4 +- internal/pkg/source/hq.go | 143 +++++++++++----------- internal/pkg/utils/url.go | 116 +----------------- internal/pkg/utils/url_test.go | 54 ++++---- pkg/models/seed.go | 2 +- pkg/models/url.go | 91 +++++++++++++- pkg/models/urltype.go | 19 +++ 9 files changed, 223 insertions(+), 221 deletions(-) create mode 100644 pkg/models/urltype.go diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index 35bf231a..a05ebd2c 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -99,19 +99,18 @@ func (p *preprocessor) preprocess(item *models.Item) { var err error if item.Status == models.ItemFresh { // Preprocess the item's URL itself - item.URL.Value, err = validateURL(item.URL.Value, nil) + err = validateURL(item.URL, nil) if err != nil { - logger.Warn("unable to validate URL", "url", item.URL.Value, "err", err.Error(), "func", "preprocessor.preprocess") + logger.Warn("unable to validate URL", "url", item.URL.Raw, "err", err.Error(), "func", 
"preprocessor.preprocess") return } } else if len(item.Childs) > 0 { // Preprocess the childs for i := 0; i < len(item.Childs); { - child := item.Childs[i] - item.Childs[i].Value, err = validateURL(child.Value, item.URL) + err = validateURL(item.Childs[i], item.URL) if err != nil { // If we can't validate an URL, we remove it from the list of childs - logger.Warn("unable to validate URL", "url", child.Value, "err", err.Error(), "func", "preprocessor.preprocess") + logger.Warn("unable to validate URL", "url", item.Childs[i].Raw, "err", err.Error(), "func", "preprocessor.preprocess") item.Childs = append(item.Childs[:i], item.Childs[i+1:]...) } else { i++ diff --git a/internal/pkg/preprocessor/url.go b/internal/pkg/preprocessor/url.go index f4cab0b8..29c68f9a 100644 --- a/internal/pkg/preprocessor/url.go +++ b/internal/pkg/preprocessor/url.go @@ -2,7 +2,7 @@ package preprocessor import "github.com/internetarchive/Zeno/pkg/models" -func validateURL(URL string, parentURL *models.URL) (validatedURL string, err error) { - // Validate the URL, make it absolute if needed, etc. - return URL, err +func validateURL(URL *models.URL, parentURL *models.URL) (err error) { + // Validate the URL, REMOVE FRAGMENTS, try to fix it, make it absolute if needed, etc. 
+ return URL.Parse() } diff --git a/internal/pkg/reactor/reactor_test.go b/internal/pkg/reactor/reactor_test.go index d455b322..b440af6f 100644 --- a/internal/pkg/reactor/reactor_test.go +++ b/internal/pkg/reactor/reactor_test.go @@ -7,7 +7,6 @@ import ( "github.com/google/uuid" "github.com/internetarchive/Zeno/pkg/models" - "github.com/internetarchive/gocrawlhq" ) func TestReactorE2E(t *testing.T) { @@ -32,6 +31,7 @@ func TestReactorE2E(t *testing.T) { if item == nil { continue } + // Send feedback for the consumed item if item.Source != models.ItemSourceFeedback { err := ReceiveFeedback(item) @@ -67,7 +67,7 @@ func TestReactorE2E(t *testing.T) { uuid := uuid.New() mockItems = append(mockItems, &models.Item{ UUID: &uuid, - URL: &models.URL{URL: gocrawlhq.URL{Value: fmt.Sprintf("http://example.com/%d", i)}}, + URL: &models.URL{Raw: fmt.Sprintf("http://example.com/%d", i)}, Status: models.ItemFresh, Source: models.ItemSourceHQ, }) diff --git a/internal/pkg/source/hq.go b/internal/pkg/source/hq.go index 47c24f34..2add78f0 100644 --- a/internal/pkg/source/hq.go +++ b/internal/pkg/source/hq.go @@ -6,6 +6,7 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/config" "github.com/internetarchive/Zeno/internal/pkg/utils" + "github.com/internetarchive/Zeno/pkg/models" "github.com/internetarchive/gocrawlhq" ) @@ -14,17 +15,12 @@ import ( // Zeno is connected. This "identify" message is sent every second and // contains the crawler's stats and details. 
func HQWebsocket() { - var ( - // the "identify" message will be sent every second - // to the crawl HQ - identifyTicker = time.NewTicker(time.Second) - ) + var identifyTicker = time.NewTicker(time.Second) defer func() { identifyTicker.Stop() }() - // send an "identify" message to the crawl HQ every second for { err := HQClient.Identify(&gocrawlhq.IdentifyMessage{ Project: config.Get().HQProject, @@ -46,6 +42,73 @@ func HQWebsocket() { } } +// func HQSeencheckURLs(URLs []*url.URL) (seencheckedBatch []*url.URL, err error) { +// var ( +// discoveredURLs []gocrawlhq.URL +// ) + +// for _, URL := range URLs { +// discoveredURLs = append(discoveredURLs, gocrawlhq.URL{ +// Value: utils.URLToString(URL), +// Type: "asset", +// }) +// } + +// outputURLs, err := HQClient.Seencheck(discoveredURLs) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ +// "batchLen": len(URLs), +// "urls": discoveredURLs, +// })).Error("error sending seencheck payload to crawl HQ") +// return seencheckedBatch, err +// } + +// if outputURLs != nil { +// for _, URL := range outputURLs { +// // the returned payload only contain new URLs to be crawled by Zeno +// newURL, err := url.Parse(URL.Value) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, URL, map[string]interface{}{ +// "batchLen": len(URLs), +// })).Error("error parsing URL from HQ seencheck response") +// return seencheckedBatch, err +// } + +// seencheckedBatch = append(seencheckedBatch, newURL) +// } +// } + +// return seencheckedBatch, nil +// } + +// HQSeencheckURL uses Crawl HQ to determine if an URL has been seen before +// in the current crawl project. If the URL is new, it returns true, otherwise +// it returns false. If there's an error sending the payload to Crawl HQ, it +// returns an error and the URL is considered new. 
+func HQSeencheck(URL *models.URL) (bool, error) { + discoveredURL := gocrawlhq.URL{ + Value: URL.String(), + Type: URL.Type(), + } + + outputURLs, err := HQClient.Seencheck([]gocrawlhq.URL{discoveredURL}) + if err != nil { + slog.Error("error sending seencheck payload to crawl HQ", "err", err, "url", URL.String()) + return true, err // return true, don't discard the URL if there's an error + } + + if outputURLs != nil { + for _, URL := range outputURLs { + // the returned payload only contain new URLs to be crawled by Zeno + if URL.Value == discoveredURL.Value { + return true, nil + } + } + } + + return false, nil +} + // func HQProducer() { // defer c.HQChannelsWg.Done() @@ -314,71 +377,3 @@ func HQWebsocket() { // } // } // } - -// func HQSeencheckURLs(URLs []*url.URL) (seencheckedBatch []*url.URL, err error) { -// var ( -// discoveredURLs []gocrawlhq.URL -// ) - -// for _, URL := range URLs { -// discoveredURLs = append(discoveredURLs, gocrawlhq.URL{ -// Value: utils.URLToString(URL), -// Type: "asset", -// }) -// } - -// outputURLs, err := c.HQClient.Seencheck(discoveredURLs) -// if err != nil { -// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ -// "batchLen": len(URLs), -// "urls": discoveredURLs, -// })).Error("error sending seencheck payload to crawl HQ") -// return seencheckedBatch, err -// } - -// if outputURLs != nil { -// for _, URL := range outputURLs { -// // the returned payload only contain new URLs to be crawled by Zeno -// newURL, err := url.Parse(URL.Value) -// if err != nil { -// c.Log.WithFields(c.genLogFields(err, URL, map[string]interface{}{ -// "batchLen": len(URLs), -// })).Error("error parsing URL from HQ seencheck response") -// return seencheckedBatch, err -// } - -// seencheckedBatch = append(seencheckedBatch, newURL) -// } -// } - -// return seencheckedBatch, nil -// } - -// returns: -// - bool: true if the URL is new, false if it has been seen before -// - error: if there's an error sending the payload to crawl HQ 
-// -// NOTE: if there's an error, the URL is considered new -// func HQSeencheckURL(URL *url.URL) (bool, error) { -// discoveredURL := gocrawlhq.URL{ -// Value: utils.URLToString(URL), -// Type: "asset", -// } - -// outputURLs, err := HQClient.Seencheck([]gocrawlhq.URL{discoveredURL}) -// if err != nil { -// Log.Error("error sending seencheck payload to crawl HQ", "err", err, "url", utils.URLToString(URL)) -// return true, err // return true, don't discard the URL if there's an error -// } - -// if outputURLs != nil { -// for _, URL := range outputURLs { -// // the returned payload only contain new URLs to be crawled by Zeno -// if URL.Value == discoveredURL.Value { -// return true, nil -// } -// } -// } - -// return false, nil -// } diff --git a/internal/pkg/utils/url.go b/internal/pkg/utils/url.go index 7ff4507f..894cc122 100644 --- a/internal/pkg/utils/url.go +++ b/internal/pkg/utils/url.go @@ -1,108 +1,19 @@ package utils import ( - "errors" - "log/slog" - "net" - "net/url" - "strings" - - "github.com/asaskevich/govalidator" - "golang.org/x/net/idna" + "github.com/internetarchive/Zeno/pkg/models" ) -func URLToString(u *url.URL) string { - var err error - - switch u.Host { - case "external-preview.redd.it", "styles.redditmedia.com", "preview.redd.it": - // Do nothing. We don't want to encode the URL for signature purposes. 
:( - break - default: - q := u.Query() - u.RawQuery = encodeQuery(q) - } - u.Host, err = idna.ToASCII(u.Host) - if err != nil { - if strings.Contains(u.Host, ":") { - hostWithoutPort, port, err := net.SplitHostPort(u.Host) - if err != nil { - slog.Warn("cannot split host and port", "error", err) - } else { - asciiHost, err := idna.ToASCII(hostWithoutPort) - if err == nil { - u.Host = asciiHost + ":" + port - } else { - slog.Warn("cannot encode punycode host without port to ASCII", "error", err) - } - } - } else { - slog.Warn("cannot encode punycode host to ASCII", "error", err) - } - } - - return u.String() -} - -// Encode encodes the values into “URL encoded” form -// from: https://cs.opensource.google/go/go/+/refs/tags/go1.23.1:src/net/url/url.go;l=1002 -// modified to not sort. -func encodeQuery(v url.Values) string { - if len(v) == 0 { - return "" - } - var buf strings.Builder - keys := make([]string, 0, len(v)) - for k := range v { - keys = append(keys, k) - } - // Modified to not sort the keys. 
- // slices.Sort(keys) - for _, k := range keys { - vs := v[k] - keyEscaped := url.QueryEscape(k) - for _, v := range vs { - if buf.Len() > 0 { - buf.WriteByte('&') - } - buf.WriteString(keyEscaped) - buf.WriteByte('=') - buf.WriteString(url.QueryEscape(v)) - } - } - return buf.String() -} - -// MakeAbsolute turn all URLs in a slice of url.URL into absolute URLs, based -// on a given base *url.URL -func MakeAbsolute(base *url.URL, URLs []*url.URL) []*url.URL { - for i, URL := range URLs { - if !URL.IsAbs() { - URLs[i] = base.ResolveReference(URL) - } - } - - return URLs -} - -func RemoveFragments(URLs []*url.URL) []*url.URL { - for i := range URLs { - URLs[i].Fragment = "" - } - - return URLs -} - // DedupeURLs take a slice of *url.URL and dedupe it -func DedupeURLs(URLs []*url.URL) []*url.URL { +func DedupeURLs(URLs []*models.URL) []*models.URL { keys := make(map[string]bool) - list := []*url.URL{} + list := make([]*models.URL, 0, len(URLs)) for _, entry := range URLs { - if _, value := keys[URLToString(entry)]; !value { - keys[URLToString(entry)] = true + if _, value := keys[entry.String()]; !value { + keys[entry.String()] = true - if entry.Scheme == "http" || entry.Scheme == "https" { + if entry.Parsed().Scheme == "http" || entry.Parsed().Scheme == "https" { list = append(list, entry) } } @@ -110,18 +21,3 @@ func DedupeURLs(URLs []*url.URL) []*url.URL { return list } - -// ValidateURL validates a *url.URL -func ValidateURL(u *url.URL) error { - valid := govalidator.IsURL(URLToString(u)) - - if u.Scheme != "http" && u.Scheme != "https" { - valid = false - } - - if !valid { - return errors.New("not a valid URL") - } - - return nil -} diff --git a/internal/pkg/utils/url_test.go b/internal/pkg/utils/url_test.go index d160b490..cd777c4e 100644 --- a/internal/pkg/utils/url_test.go +++ b/internal/pkg/utils/url_test.go @@ -1,123 +1,133 @@ package utils import ( - "net/url" "testing" + + "github.com/internetarchive/Zeno/pkg/models" ) func TestURLToStringPunycode(t 
*testing.T) { - u, err := url.Parse("https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia/pic/file/map_of_sarlat.pdf") + u := &models.URL{Raw: "https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia/pic/file/map_of_sarlat.pdf"} + err := u.Parse() if err != nil { - t.Fatalf("Error parsing URL: %v", err) + t.Fatalf("error parsing URL: %v", err) } expected := "https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia/pic/file/map_of_sarlat.pdf" - actual := URLToString(u) + actual := u.String() if actual != expected { - t.Fatalf("Expected %s, got %s", expected, actual) + t.Fatalf("expected %s, got %s", expected, actual) } } func TestURLToStringPunycodeWithPort(t *testing.T) { - u, err := url.Parse("https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia:8080/pic/file/map_of_sarlat.pdf") + u := &models.URL{Raw: "https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia:8080/pic/file/map_of_sarlat.pdf"} + err := u.Parse() if err != nil { t.Fatalf("Error parsing URL: %v", err) } expected := "https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia:8080/pic/file/map_of_sarlat.pdf" - actual := URLToString(u) + actual := u.String() if actual != expected { t.Fatalf("Expected %s, got %s", expected, actual) } } func TestURLToStringUnicodetoIDNA(t *testing.T) { - u, err := url.Parse("https://о-змладйвеклблнозеж.xn--p1ia:8080/pic/file/map_of_sarlat.pdf") + u := &models.URL{Raw: "https://о-змладйвеклблнозеж.xn--p1ia:8080/pic/file/map_of_sarlat.pdf"} + err := u.Parse() if err != nil { t.Fatalf("Error parsing URL: %v", err) } expected := "https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia:8080/pic/file/map_of_sarlat.pdf" - actual := URLToString(u) + actual := u.String() if actual != expected { t.Fatalf("Expected %s, got %s", expected, actual) } } func TestURLToStringWithPath(t *testing.T) { - u, err := url.Parse("http://παράδειγμα.δοκιμή/Αρχική_σελίδα") + u := &models.URL{Raw: "http://παράδειγμα.δοκιμή/Αρχική_σελίδα"} + err := u.Parse() if err != nil { t.Fatalf("Error parsing URL: %v", err) } expected := 
"http://xn--hxajbheg2az3al.xn--jxalpdlp/%CE%91%CF%81%CF%87%CE%B9%CE%BA%CE%AE_%CF%83%CE%B5%CE%BB%CE%AF%CE%B4%CE%B1" - actual := URLToString(u) + actual := u.String() if actual != expected { t.Fatalf("Expected %s, got %s", expected, actual) } } func TestURLToStringUnicodetoIDNAWithPort(t *testing.T) { - u, err := url.Parse("https://о-змладйвеклблнозеж.xn--p1ia:8080/pic/file/map_of_sarlat.pdf") + u := &models.URL{Raw: "https://о-змладйвеклблнозеж.xn--p1ia:8080/pic/file/map_of_sarlat.pdf"} + err := u.Parse() if err != nil { t.Fatalf("Error parsing URL: %v", err) } expected := "https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia:8080/pic/file/map_of_sarlat.pdf" - actual := URLToString(u) + actual := u.String() if actual != expected { t.Fatalf("Expected %s, got %s", expected, actual) } } func TestURLwithIPv6(t *testing.T) { - u, err := url.Parse("https://[2600:4040:23c7:a620:3642:ebaa:ab23:735e]/test") + u := &models.URL{Raw: "https://[2600:4040:23c7:a620:3642:ebaa:ab23:735e]/test"} + err := u.Parse() if err != nil { t.Fatalf("Error parsing URL: %v", err) } expected := "https://[2600:4040:23c7:a620:3642:ebaa:ab23:735e]/test" - actual := URLToString(u) + actual := u.String() if actual != expected { t.Fatalf("Expected %s, got %s", expected, actual) } } func TestURLwithIPv6WithPort(t *testing.T) { - u, err := url.Parse("https://[2600:4040:23c7:a620:3642:ebaa:ab23:735e]:8080/test") + u := &models.URL{Raw: "https://[2600:4040:23c7:a620:3642:ebaa:ab23:735e]:8080/test"} + err := u.Parse() if err != nil { t.Fatalf("Error parsing URL: %v", err) } expected := "https://[2600:4040:23c7:a620:3642:ebaa:ab23:735e]:8080/test" - actual := URLToString(u) + actual := u.String() if actual != expected { t.Fatalf("Expected %s, got %s", expected, actual) } } func TestURLwithSpacesandUnicode(t *testing.T) { - u, err := url.Parse("https://www.youtube.com/watch/0HBwC_wIFF4?t=18363石神視点【Minecraft】平日もど真ん中なんだから早く寝なきゃ【石神のぞみ/にじさんじ所属】https://www.youtube.com/watch/L30uAR9X8Uw?t=10100【倉持エン足中") + u := 
&models.URL{Raw: "https://www.youtube.com/watch/0HBwC_wIFF4?t=18363石神視点【Minecraft】平日もど真ん中なんだから早く寝なきゃ【石神のぞみ/にじさんじ所属】https://www.youtube.com/watch/L30uAR9X8Uw?t=10100【倉持エン足中"} + err := u.Parse() if err != nil { t.Fatalf("Error parsing URL: %v", err) } expected := "https://www.youtube.com/watch/0HBwC_wIFF4?t=18363%E7%9F%B3%E7%A5%9E%E8%A6%96%E7%82%B9%E3%80%90Minecraft%E3%80%91%E5%B9%B3%E6%97%A5%E3%82%82%E3%81%A9%E7%9C%9F%E3%82%93%E4%B8%AD%E3%81%AA%E3%82%93%E3%81%A0%E3%81%8B%E3%82%89%E6%97%A9%E3%81%8F%E5%AF%9D%E3%81%AA%E3%81%8D%E3%82%83%E3%80%90%E7%9F%B3%E7%A5%9E%E3%81%AE%E3%81%9E%E3%81%BF%EF%BC%8F%E3%81%AB%E3%81%98%E3%81%95%E3%82%93%E3%81%98%E6%89%80%E5%B1%9E%E3%80%91https%3A%2F%2Fwww.youtube.com%2Fwatch%2FL30uAR9X8Uw%3Ft%3D10100%E3%80%90%E5%80%89%E6%8C%81%E3%82%A8%E3%83%B3%E8%B6%B3%E4%B8%AD" - actual := URLToString(u) + actual := u.String() if actual != expected { t.Fatalf("Expected %s, got %s", expected, actual) } } -// For technical reasons we are not encoding reddit URLs. +// For technical reasons we are not encoding Reddit URLs. 
func TestURLwithRedditOverride(t *testing.T) { - u, err := url.Parse("https://styles.redditmedia.com/t5_7wkhw/styles/profileIcon_8w6r6fr3rh2d1.jpeg?width=64&height=64&frame=1&auto=webp&crop=64:64,smart&s=6d8ab9b89c9b846c9eb65622db9ced4992dc0905") + u := &models.URL{Raw: "https://styles.redditmedia.com/t5_7wkhw/styles/profileIcon_8w6r6fr3rh2d1.jpeg?width=64&height=64&frame=1&auto=webp&crop=64:64,smart&s=6d8ab9b89c9b846c9eb65622db9ced4992dc0905"} + err := u.Parse() if err != nil { t.Fatalf("Error parsing URL: %v", err) } expected := "https://styles.redditmedia.com/t5_7wkhw/styles/profileIcon_8w6r6fr3rh2d1.jpeg?width=64&height=64&frame=1&auto=webp&crop=64:64,smart&s=6d8ab9b89c9b846c9eb65622db9ced4992dc0905" - actual := URLToString(u) + actual := u.String() if actual != expected { t.Fatalf("Expected %s, got %s", expected, actual) } diff --git a/pkg/models/seed.go b/pkg/models/seed.go index 8f6a7a8d..57f4421f 100644 --- a/pkg/models/seed.go +++ b/pkg/models/seed.go @@ -7,7 +7,7 @@ import ( // Item represents a URL, it's childs (e.g. 
discovered assets) and it's state in the pipeline type Item struct { UUID *uuid.UUID // UUID is the unique identifier of the item - URL *URL // URL is the URL of the item + URL *URL // URL is a struct that contains the URL, the parsed URL, and its hop Status ItemState // Status is the state of the item in the pipeline Source ItemSource // Source is the source of the item in the pipeline ChildsCaptured bool // ChildsCaptured is the flag to indicate if the child URLs of the item have been captured diff --git a/pkg/models/url.go b/pkg/models/url.go index 121c08f2..4f636cd0 100644 --- a/pkg/models/url.go +++ b/pkg/models/url.go @@ -1,15 +1,98 @@ package models import ( + "log/slog" + "net" "net/url" + "strings" - "github.com/internetarchive/gocrawlhq" + "golang.org/x/net/idna" ) type URL struct { - gocrawlhq.URL + Raw string + parsed *url.URL + hop int // This determines the number of hops this item is the result of, a hop is a "jump" from 1 page to another page + urlType URLType } -func (u *URL) Parsed() (URL *url.URL, err error) { - return url.Parse(u.Value) +func (u *URL) Parse() (err error) { + u.parsed, err = url.ParseRequestURI(u.Raw) + return err +} + +func (u *URL) Parsed() *url.URL { + return u.parsed +} + +func (u *URL) Type() string { + return u.urlType.String() +} + +func (u *URL) Hop() int { + return u.hop +} + +// String exists to apply some custom stuff, in opposition of simply +// using the u.parsed.String() method +func (u *URL) String() string { + var err error + + switch u.parsed.Host { + case "external-preview.redd.it", "styles.redditmedia.com", "preview.redd.it": + // Do nothing. We don't want to encode the URL for signature purposes. 
:( + break + default: + q := u.parsed.Query() + u.parsed.RawQuery = encodeQuery(q) + } + u.parsed.Host, err = idna.ToASCII(u.parsed.Host) + if err != nil { + if strings.Contains(u.parsed.Host, ":") { + hostWithoutPort, port, err := net.SplitHostPort(u.parsed.Host) + if err != nil { + slog.Warn("cannot split host and port", "error", err) + } else { + asciiHost, err := idna.ToASCII(hostWithoutPort) + if err == nil { + u.parsed.Host = asciiHost + ":" + port + } else { + slog.Warn("cannot encode punycode host without port to ASCII", "error", err) + } + } + } else { + slog.Warn("cannot encode punycode host to ASCII", "error", err) + } + } + + return u.parsed.String() +} + +// Encode encodes the values into “URL encoded” form +// from: https://cs.opensource.google/go/go/+/refs/tags/go1.23.1:src/net/url/url.go;l=1002 +// REASON: it has been modified to not sort +func encodeQuery(v url.Values) string { + if len(v) == 0 { + return "" + } + var buf strings.Builder + keys := make([]string, 0, len(v)) + for k := range v { + keys = append(keys, k) + } + // Modified to not sort the keys. 
+ // slices.Sort(keys) + for _, k := range keys { + vs := v[k] + keyEscaped := url.QueryEscape(k) + for _, v := range vs { + if buf.Len() > 0 { + buf.WriteByte('&') + } + buf.WriteString(keyEscaped) + buf.WriteByte('=') + buf.WriteString(url.QueryEscape(v)) + } + } + return buf.String() } diff --git a/pkg/models/urltype.go b/pkg/models/urltype.go new file mode 100644 index 00000000..3e334801 --- /dev/null +++ b/pkg/models/urltype.go @@ -0,0 +1,19 @@ +package models + +type URLType int64 + +const ( + Seed URLType = iota + Asset +) + +func (t URLType) String() string { + switch t { + case Seed: + return "seed" + case Asset: + return "asset" + } + + return "" +} From 661424d662758ffa509c4c4c13622c9b6b874350 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 15:00:34 +0100 Subject: [PATCH 023/295] global: change the init of logger of packages that import it --- internal/pkg/log/logger.go | 1 - internal/pkg/preprocessor/preprocessor.go | 12 +++++------- internal/pkg/reactor/reactor.go | 12 +++++------- main.go | 8 +++----- 4 files changed, 13 insertions(+), 20 deletions(-) diff --git a/internal/pkg/log/logger.go b/internal/pkg/log/logger.go index e0d432da..d64a88a3 100644 --- a/internal/pkg/log/logger.go +++ b/internal/pkg/log/logger.go @@ -83,6 +83,5 @@ func formatArgs(args []any) string { // Helper function to format log entries func formatLogEntry(entry *logEntry) string { - return fmt.Sprintf("%s [%s] %s\t%s", entry.timestamp.Format(time.RFC3339), entry.level.String(), entry.msg, formatArgs(entry.args)) } diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index a05ebd2c..6a10c4d4 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -23,18 +23,16 @@ var ( logger *log.FieldedLogger ) -func init() { - log.Init() - logger = log.NewFieldedLogger(&log.Fields{ - "component": "preprocessor", - }) -} - // This functions starts the preprocessor responsible 
for preparing // the seeds sent by the reactor for captures func Start(inputChan, outputChan chan *models.Item) error { var done bool + log.Init() + logger = log.NewFieldedLogger(&log.Fields{ + "component": "preprocessor", + }) + once.Do(func() { ctx, cancel := context.WithCancel(context.Background()) globalPreprocessor = &preprocessor{ diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index db4c296c..c467e524 100644 --- a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -26,18 +26,16 @@ var ( logger *log.FieldedLogger ) -func init() { - log.Init() - logger = log.NewFieldedLogger(&log.Fields{ - "component": "reactor", - }) -} - // Start initializes the global reactor with the given maximum tokens. // This method can only be called once. func Start(maxTokens int, outputChan chan *models.Item) error { var done bool + log.Init() + logger = log.NewFieldedLogger(&log.Fields{ + "component": "reactor", + }) + once.Do(func() { ctx, cancel := context.WithCancel(context.Background()) globalReactor = &reactor{ diff --git a/main.go b/main.go index 4ec356cc..80418fe4 100644 --- a/main.go +++ b/main.go @@ -24,15 +24,13 @@ var ( logger *log.FieldedLogger ) -func init() { +func main() { log.Init() logger = log.NewFieldedLogger(&log.Fields{ - "component": "main", + "component": "preprocessor", }) -} - -func main() { defer log.Shutdown() + if err := cmd.Run(); err != nil { fmt.Println(err) os.Exit(1) From 78669aa80ebcc792061f13185da301cb22962eb6 Mon Sep 17 00:00:00 2001 From: Will Howes Date: Tue, 19 Nov 2024 16:58:08 +0100 Subject: [PATCH 024/295] remove urlType from URL struct --- pkg/models/url.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pkg/models/url.go b/pkg/models/url.go index 4f636cd0..19f7f334 100644 --- a/pkg/models/url.go +++ b/pkg/models/url.go @@ -10,10 +10,9 @@ import ( ) type URL struct { - Raw string - parsed *url.URL - hop int // This determines the number of hops this item is the 
result of, a hop is a "jump" from 1 page to another page - urlType URLType + Raw string + parsed *url.URL + hop int // This determines the number of hops this item is the result of, a hop is a "jump" from 1 page to another page } func (u *URL) Parse() (err error) { From 89a466feb0358fb422242b2c38332915b6279793 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 17:06:21 +0100 Subject: [PATCH 025/295] reactor: more/better tests --- internal/pkg/reactor/reactor_test.go | 44 +++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/internal/pkg/reactor/reactor_test.go b/internal/pkg/reactor/reactor_test.go index b440af6f..5847372d 100644 --- a/internal/pkg/reactor/reactor_test.go +++ b/internal/pkg/reactor/reactor_test.go @@ -6,17 +6,44 @@ import ( "time" "github.com/google/uuid" + "github.com/internetarchive/Zeno/internal/pkg/log" "github.com/internetarchive/Zeno/pkg/models" ) -func TestReactorE2E(t *testing.T) { +func TestReactor_E2E_Balanced(t *testing.T) { + _testerFunc(50, 50, 1000, t) +} + +func TestReactor_E2E_Unbalanced_MoreConsumers(t *testing.T) { + _testerFunc(10, 50, 1000, t) +} + +func TestReactor_E2E_Unbalanced_MoreTokens(t *testing.T) { + _testerFunc(50, 10, 1000, t) +} + +func TestReactor_E2E_BalancedBig(t *testing.T) { + _testerFunc(5000, 5000, 1000000, t) +} + +func TestReactor_E2E_UnbalancedBig_MoreConsumers(t *testing.T) { + _testerFunc(50, 5000, 1000000, t) +} + +func TestReactor_E2E_UnbalancedBig_MoreTokens(t *testing.T) { + _testerFunc(50, 5000, 1000000, t) +} + +func _testerFunc(tokens, consumers, seeds int, t testing.TB) { // Initialize the reactor with a maximum of 5 tokens outputChan := make(chan *models.Item) err := Start(1, outputChan) + if err != nil { t.Logf("Error starting reactor: %s", err) return } + defer log.Shutdown() defer Stop() // Channel to collect errors from goroutines @@ -82,8 +109,17 @@ func TestReactorE2E(t *testing.T) { } // Allow some time for processing - time.Sleep(5 * 
time.Second) - if len(GetStateTable()) > 0 { - t.Fatalf("State table is not empty: %s", GetStateTable()) + for { + select { + case <-time.After(5 * time.Second): + if len(GetStateTable()) > 0 { + t.Fatalf("State table is not empty: %s", GetStateTable()) + } + t.Fatalf("Timeout waiting for reactor to finish processing") + default: + if len(GetStateTable()) == 0 { + return + } + } } } From 515f36e8174f3127bc30c2a1c54d9ceeeb909e29 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 17:14:50 +0100 Subject: [PATCH 026/295] models/url: urlType still used in method --- pkg/models/url.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/models/url.go b/pkg/models/url.go index 19f7f334..a5b0aefd 100644 --- a/pkg/models/url.go +++ b/pkg/models/url.go @@ -25,7 +25,7 @@ func (u *URL) Parsed() *url.URL { } func (u *URL) Type() string { - return u.urlType.String() + return u.String() } func (u *URL) Hop() int { From b48a748b8c289117369e677bd5455b2bd2579af8 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 17:15:54 +0100 Subject: [PATCH 027/295] log: renamed Shutdown to Stop and removed Stop from other packages stop functions --- internal/pkg/log/log.go | 4 ++-- internal/pkg/preprocessor/preprocessor.go | 1 - internal/pkg/reactor/reactor_test.go | 2 -- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/internal/pkg/log/log.go b/internal/pkg/log/log.go index 307be538..c461d09d 100644 --- a/internal/pkg/log/log.go +++ b/internal/pkg/log/log.go @@ -62,8 +62,8 @@ func logWithLevel(level slog.Level, msg string, args ...any) { } } -// Shutdown gracefully shuts down the logging system -func Shutdown() { +// Stop gracefully shuts down the logging system +func Stop() { if cancelFunc != nil { cancelFunc() } diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index 6a10c4d4..804a270f 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ 
b/internal/pkg/preprocessor/preprocessor.go @@ -60,7 +60,6 @@ func Stop() { globalPreprocessor.wg.Wait() close(globalPreprocessor.output) logger.Info("stopped") - log.Shutdown() } } diff --git a/internal/pkg/reactor/reactor_test.go b/internal/pkg/reactor/reactor_test.go index 5847372d..c7482a19 100644 --- a/internal/pkg/reactor/reactor_test.go +++ b/internal/pkg/reactor/reactor_test.go @@ -6,7 +6,6 @@ import ( "time" "github.com/google/uuid" - "github.com/internetarchive/Zeno/internal/pkg/log" "github.com/internetarchive/Zeno/pkg/models" ) @@ -43,7 +42,6 @@ func _testerFunc(tokens, consumers, seeds int, t testing.TB) { t.Logf("Error starting reactor: %s", err) return } - defer log.Shutdown() defer Stop() // Channel to collect errors from goroutines From 30d7251014db07b64daf249ba7bb20438e3c3fa9 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 17:16:12 +0100 Subject: [PATCH 028/295] add finish package --- internal/pkg/finish/finish.go | 1 + 1 file changed, 1 insertion(+) create mode 100644 internal/pkg/finish/finish.go diff --git a/internal/pkg/finish/finish.go b/internal/pkg/finish/finish.go new file mode 100644 index 00000000..5105e663 --- /dev/null +++ b/internal/pkg/finish/finish.go @@ -0,0 +1 @@ +package finish From 784655503e3bc19060907f3fea816410ec89110c Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 17:39:50 +0100 Subject: [PATCH 029/295] log: fixed start/close behaviour ; reactor: made tests handle error better --- internal/pkg/log/error.go | 8 ++++ internal/pkg/log/log.go | 16 +++++++- internal/pkg/preprocessor/preprocessor.go | 2 +- internal/pkg/reactor/reactor.go | 8 +++- internal/pkg/reactor/reactor_test.go | 47 +++++++++++++---------- main.go | 4 +- 6 files changed, 58 insertions(+), 27 deletions(-) create mode 100644 internal/pkg/log/error.go diff --git a/internal/pkg/log/error.go b/internal/pkg/log/error.go new file mode 100644 index 00000000..dc99de15 --- /dev/null +++ b/internal/pkg/log/error.go @@ -0,0 +1,8 
@@ +package log + +import "errors" + +var ( + // ErrLoggerAlreadyInitialized is the error returned when the logger is already initialized + ErrLoggerAlreadyInitialized = errors.New("logger already initialized") +) diff --git a/internal/pkg/log/log.go b/internal/pkg/log/log.go index c461d09d..40f3638d 100644 --- a/internal/pkg/log/log.go +++ b/internal/pkg/log/log.go @@ -17,17 +17,27 @@ var ( cancelFunc context.CancelFunc ) -// Init initializes the logging package with the given configuration. +// Start initializes the logging package with the given configuration. // If no configuration is provided, it uses the default configuration. -func Init(cfgs ...*Config) { +func Start(cfgs ...*Config) error { + var done = false + once.Do(func() { + logQueue = make(chan *logEntry, 1000) if len(cfgs) > 0 && cfgs[0] != nil { config = cfgs[0] } else { config = defaultConfig() } setupLogger() + done = true }) + + if !done { + return ErrLoggerAlreadyInitialized + } + + return nil } // Public logging methods @@ -68,4 +78,6 @@ func Stop() { cancelFunc() } wg.Wait() + close(logQueue) + once = sync.Once{} } diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index 804a270f..bf5e5f44 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -28,7 +28,7 @@ var ( func Start(inputChan, outputChan chan *models.Item) error { var done bool - log.Init() + log.Start() logger = log.NewFieldedLogger(&log.Fields{ "component": "preprocessor", }) diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index c467e524..53ea6a15 100644 --- a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -18,6 +18,7 @@ type reactor struct { output chan *models.Item // Output channel stateTable sync.Map // State table for tracking seeds by UUID wg sync.WaitGroup // WaitGroup to manage goroutines + // stopChan chan struct{} // Channel to signal when stop is finished } var ( @@ 
-31,7 +32,7 @@ var ( func Start(maxTokens int, outputChan chan *models.Item) error { var done bool - log.Init() + log.Start() logger = log.NewFieldedLogger(&log.Fields{ "component": "reactor", }) @@ -45,6 +46,7 @@ func Start(maxTokens int, outputChan chan *models.Item) error { input: make(chan *models.Item, maxTokens), output: outputChan, } + logger.Info("initialized") globalReactor.wg.Add(1) go globalReactor.run() logger.Info("started") @@ -63,7 +65,9 @@ func Stop() { if globalReactor != nil { globalReactor.cancel() globalReactor.wg.Wait() - close(globalReactor.output) + close(globalReactor.input) + close(globalReactor.tokenPool) + once = sync.Once{} logger.Info("stopped") } } diff --git a/internal/pkg/reactor/reactor_test.go b/internal/pkg/reactor/reactor_test.go index c7482a19..f995de83 100644 --- a/internal/pkg/reactor/reactor_test.go +++ b/internal/pkg/reactor/reactor_test.go @@ -6,6 +6,7 @@ import ( "time" "github.com/google/uuid" + "github.com/internetarchive/Zeno/internal/pkg/log" "github.com/internetarchive/Zeno/pkg/models" ) @@ -22,33 +23,32 @@ func TestReactor_E2E_Unbalanced_MoreTokens(t *testing.T) { } func TestReactor_E2E_BalancedBig(t *testing.T) { - _testerFunc(5000, 5000, 1000000, t) + _testerFunc(5000, 5000, 100000, t) } func TestReactor_E2E_UnbalancedBig_MoreConsumers(t *testing.T) { - _testerFunc(50, 5000, 1000000, t) + _testerFunc(50, 5000, 100000, t) } func TestReactor_E2E_UnbalancedBig_MoreTokens(t *testing.T) { - _testerFunc(50, 5000, 1000000, t) + _testerFunc(5000, 50, 100000, t) } func _testerFunc(tokens, consumers, seeds int, t testing.TB) { // Initialize the reactor with a maximum of 5 tokens outputChan := make(chan *models.Item) - err := Start(1, outputChan) + err := Start(tokens, outputChan) if err != nil { t.Logf("Error starting reactor: %s", err) return } - defer Stop() // Channel to collect errors from goroutines - errorChan := make(chan error) + fatalChan := make(chan error, consumers) // Consume items from the output channel, start 
5 goroutines - for i := 0; i < 5; i++ { + for i := 0; i < consumers; i++ { go func() { for { select { @@ -61,7 +61,7 @@ func _testerFunc(tokens, consumers, seeds int, t testing.TB) { if item.Source != models.ItemSourceFeedback { err := ReceiveFeedback(item) if err != nil { - errorChan <- fmt.Errorf("Error sending feedback: %s - %s", err, item.UUID.String()) + fatalChan <- fmt.Errorf("Error sending feedback: %s - %s", err, item.UUID.String()) } continue } @@ -70,7 +70,7 @@ func _testerFunc(tokens, consumers, seeds int, t testing.TB) { if item.Source == models.ItemSourceFeedback { err := MarkAsFinished(item) if err != nil { - errorChan <- fmt.Errorf("Error marking item as finished: %s", err) + fatalChan <- fmt.Errorf("Error marking item as finished: %s", err) } continue } @@ -79,16 +79,9 @@ func _testerFunc(tokens, consumers, seeds int, t testing.TB) { }() } - // Handle errors from goroutines - go func() { - for err := range errorChan { - t.Error(err) - } - }() - // Create mock seeds mockItems := []*models.Item{} - for i := 0; i <= 1000; i++ { + for i := 0; i <= seeds; i++ { uuid := uuid.New() mockItems = append(mockItems, &models.Item{ UUID: &uuid, @@ -102,20 +95,34 @@ func _testerFunc(tokens, consumers, seeds int, t testing.TB) { for _, seed := range mockItems { err := ReceiveInsert(seed) if err != nil { - t.Fatalf("Error queuing seed to source channel: %s", err) + Stop() + log.Stop() + t.Errorf("Error queuing seed to source channel: %s", err) + return } } // Allow some time for processing for { select { + case err := <-fatalChan: + Stop() + log.Stop() + t.Errorf("Received error while processing %s", err) + return case <-time.After(5 * time.Second): + Stop() + log.Stop() if len(GetStateTable()) > 0 { - t.Fatalf("State table is not empty: %s", GetStateTable()) + t.Errorf("State table is not empty: %s", GetStateTable()) + return } - t.Fatalf("Timeout waiting for reactor to finish processing") + t.Errorf("Timeout waiting for reactor to finish processing") + return 
default: if len(GetStateTable()) == 0 { + Stop() + log.Stop() return } } diff --git a/main.go b/main.go index 80418fe4..4b95d921 100644 --- a/main.go +++ b/main.go @@ -25,11 +25,11 @@ var ( ) func main() { - log.Init() + log.Start() logger = log.NewFieldedLogger(&log.Fields{ "component": "preprocessor", }) - defer log.Shutdown() + defer log.Stop() if err := cmd.Run(); err != nil { fmt.Println(err) From b7757e33684d1a8e81f1228cda2e265c41f73193 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 19 Nov 2024 17:47:50 +0100 Subject: [PATCH 030/295] add: seencheck in preprocessing --- .old/internal/pkg/seencheck/seencheck.go | 62 ---------- go.mod | 5 + go.sum | 26 +++++ internal/pkg/preprocessor/preprocessor.go | 57 ++++++++- .../pkg/preprocessor/seencheck/seencheck.go | 68 +++++++++++ internal/pkg/source/{ => hq}/hq.go | 109 +----------------- internal/pkg/source/hq/seencheck.go | 40 +++++++ internal/pkg/source/hq/websocket.go | 42 +++++++ internal/pkg/source/source.go | 6 - main.go | 2 - pkg/models/url.go | 4 - pkg/models/urltype.go | 19 --- 12 files changed, 239 insertions(+), 201 deletions(-) delete mode 100644 .old/internal/pkg/seencheck/seencheck.go create mode 100644 internal/pkg/preprocessor/seencheck/seencheck.go rename internal/pkg/source/{ => hq}/hq.go (72%) create mode 100644 internal/pkg/source/hq/seencheck.go create mode 100644 internal/pkg/source/hq/websocket.go delete mode 100644 pkg/models/urltype.go diff --git a/.old/internal/pkg/seencheck/seencheck.go b/.old/internal/pkg/seencheck/seencheck.go deleted file mode 100644 index 1a2fc764..00000000 --- a/.old/internal/pkg/seencheck/seencheck.go +++ /dev/null @@ -1,62 +0,0 @@ -package seencheck - -import ( - "hash/fnv" - "path" - "strconv" - "sync/atomic" - - "github.com/philippgille/gokv/leveldb" -) - -// Seencheck holds the Seencheck database and the seen counter -type Seencheck struct { - Count *int64 - DB leveldb.Store -} - -func New(jobPath string) (seencheck *Seencheck, err error) { - seencheck 
= new(Seencheck) - count := int64(0) - seencheck.Count = &count - seencheck.DB, err = leveldb.NewStore(leveldb.Options{Path: path.Join(jobPath, "seencheck")}) - if err != nil { - return seencheck, err - } - - return seencheck, nil -} - -func (seencheck *Seencheck) Close() { - seencheck.DB.Close() -} - -// IsSeen check if the hash is in the seencheck database -func (seencheck *Seencheck) IsSeen(hash string) (found bool, value string) { - found, err := seencheck.DB.Get(hash, &value) - if err != nil { - panic(err) - } - - return found, value -} - -// Seen mark a hash as seen and increment the seen counter -func (seencheck *Seencheck) Seen(hash, value string) { - seencheck.DB.Set(hash, value) - atomic.AddInt64(seencheck.Count, 1) -} - -func (seencheck *Seencheck) SeencheckURL(URL string, URLType string) bool { - h := fnv.New64a() - h.Write([]byte(URL)) - hash := strconv.FormatUint(h.Sum64(), 10) - - found, _ := seencheck.IsSeen(hash) - if found { - return true - } else { - seencheck.Seen(hash, URLType) - return false - } -} diff --git a/go.mod b/go.mod index 6dcb22b9..0b59d61b 100644 --- a/go.mod +++ b/go.mod @@ -20,18 +20,23 @@ require ( github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/ws v1.4.0 // indirect + github.com/golang/snappy v0.0.1 // indirect github.com/google/go-cmp v0.6.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/magiconair/properties v1.8.7 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/pelletier/go-toml/v2 v2.2.2 // indirect + github.com/philippgille/gokv/encoding v0.7.0 // indirect + github.com/philippgille/gokv/leveldb v0.7.0 // indirect + github.com/philippgille/gokv/util v0.7.0 // indirect github.com/sagikazarmark/locafero v0.4.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect github.com/sourcegraph/conc v0.3.0 // indirect github.com/spf13/afero v1.11.0 // indirect 
github.com/spf13/cast v1.6.0 // indirect github.com/subosito/gotenv v1.6.0 // indirect + github.com/syndtr/goleveldb v1.0.0 // indirect go.uber.org/atomic v1.9.0 // indirect go.uber.org/multierr v1.9.0 // indirect golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect diff --git a/go.sum b/go.sum index 5de95c8c..44037eae 100644 --- a/go.sum +++ b/go.sum @@ -11,6 +11,7 @@ github.com/elastic/go-elasticsearch/v7 v7.17.10 h1:TCQ8i4PmIJuBunvBS6bwT2ybzVFxx github.com/elastic/go-elasticsearch/v7 v7.17.10/go.mod h1:OJ4wdbtDNk5g503kvlHLyErCgQwwzmDtaFC4XyOxXA4= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= +github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU= @@ -19,12 +20,17 @@ github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs= github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4= +github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= +github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/internetarchive/gocrawlhq v1.2.20 h1:0mIIt9lhPacKr6L2JeISoopQ8EgzC3dISJ3ITGGbOp4= @@ -37,8 +43,19 @@ github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0V github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM= github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs= +github.com/philippgille/gokv v0.7.0 h1:rQSIQspete82h78Br7k7rKUZ8JYy/hWlwzm/W5qobPI= +github.com/philippgille/gokv v0.7.0/go.mod h1:OwiTP/3bhEBhSuOmFmq1+rszglfSgjJVxd1HOgOa2N4= +github.com/philippgille/gokv/encoding v0.7.0 h1:2oxepKzzTsi00iLZBCZ7Rmqrallh9zws3iqSrLGfkgo= +github.com/philippgille/gokv/encoding v0.7.0/go.mod h1:yncOBBUciyniPI8t5ECF8XSCwhONE9Rjf3My5IHs3fA= +github.com/philippgille/gokv/leveldb v0.7.0 h1:QTH83utBG8knTTFzO1tIF1amKIjz9xxOPLaZrU48kdQ= 
+github.com/philippgille/gokv/leveldb v0.7.0/go.mod h1:EE/vyZ5VwPPWwZHKJYWd/rkqUIJXFykKA5eluazFByc= +github.com/philippgille/gokv/util v0.7.0 h1:5avUK/a3aSj/aWjhHv4/FkqgMon2B7k2BqFgLcR+DYg= +github.com/philippgille/gokv/util v0.7.0/go.mod h1:i9KLHbPxGiHLMhkix/CcDQhpPbCkJy5BkW+RKgwDHMo= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -73,6 +90,8 @@ github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsT github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= +github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE= +github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= @@ -81,18 +100,25 @@ go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g= golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= +golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo= golang.org/x/net v0.31.0/go.mod 
h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= +gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index bf5e5f44..46a5ae7e 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -6,6 +6,8 @@ import ( 
"github.com/internetarchive/Zeno/internal/pkg/config" "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/Zeno/internal/pkg/preprocessor/seencheck" + "github.com/internetarchive/Zeno/internal/pkg/source/hq" "github.com/internetarchive/Zeno/pkg/models" ) @@ -93,16 +95,32 @@ func (p *preprocessor) run() { func (p *preprocessor) preprocess(item *models.Item) { // Validate the URL of either the item itself and/or its childs - var err error + // TODO: if an error happen and it's a fresh item, we should mark it as failed in HQ (if it's a HQ-based crawl) + + var ( + err error + URLsToSeencheck []*models.URL + URLType string + ) + + // Validate the URLs, either the item's URL or its childs if it has any if item.Status == models.ItemFresh { - // Preprocess the item's URL itself + URLType = "seed" + + // Validate the item's URL itself err = validateURL(item.URL, nil) if err != nil { logger.Warn("unable to validate URL", "url", item.URL.Raw, "err", err.Error(), "func", "preprocessor.preprocess") return } + + if config.Get().UseSeencheck { + URLsToSeencheck = append(URLsToSeencheck, item.URL) + } } else if len(item.Childs) > 0 { - // Preprocess the childs + URLType = "asset" + + // Validate the URLs of the child items for i := 0; i < len(item.Childs); { err = validateURL(item.Childs[i], item.URL) if err != nil { @@ -110,6 +128,10 @@ func (p *preprocessor) preprocess(item *models.Item) { logger.Warn("unable to validate URL", "url", item.Childs[i].Raw, "err", err.Error(), "func", "preprocessor.preprocess") item.Childs = append(item.Childs[:i], item.Childs[i+1:]...) 
} else { + if config.Get().UseSeencheck { + URLsToSeencheck = append(URLsToSeencheck, item.Childs[i]) + } + i++ } } @@ -117,6 +139,35 @@ func (p *preprocessor) preprocess(item *models.Item) { logger.Error("item got into preprocessing without anything to preprocess") } + // If we have URLs to seencheck, we do it + if len(URLsToSeencheck) > 0 { + var seencheckedURLs []*models.URL + + if config.Get().HQ { + seencheckedURLs, err = hq.SeencheckURLs(URLType, item.URL) + if err != nil { + logger.Warn("unable to seencheck URL", "url", item.URL.Raw, "err", err.Error(), "func", "preprocessor.preprocess") + return + } + } else { + seencheckedURLs, err = seencheck.SeencheckURLs(URLType, item.URL) + if err != nil { + logger.Warn("unable to seencheck URL", "url", item.URL.Raw, "err", err.Error(), "func", "preprocessor.preprocess") + return + } + } + + if len(seencheckedURLs) == 0 { + return + } + + if URLType == "seed" { + item.URL = seencheckedURLs[0] + } else { + item.Childs = seencheckedURLs + } + } + // Final step, send the preprocessed item to the output chan of the preprocessor p.output <- item } diff --git a/internal/pkg/preprocessor/seencheck/seencheck.go b/internal/pkg/preprocessor/seencheck/seencheck.go new file mode 100644 index 00000000..08c45b9f --- /dev/null +++ b/internal/pkg/preprocessor/seencheck/seencheck.go @@ -0,0 +1,68 @@ +package seencheck + +import ( + "hash/fnv" + "path" + "strconv" + "sync/atomic" + + "github.com/internetarchive/Zeno/pkg/models" + "github.com/philippgille/gokv/leveldb" +) + +// Seencheck holds the Seencheck database and the seen counter +type Seencheck struct { + Count *int64 + DB leveldb.Store +} + +var ( + globalSeencheck *Seencheck +) + +func Start(jobPath string) (err error) { + count := int64(0) + globalSeencheck = new(Seencheck) + globalSeencheck.Count = &count + globalSeencheck.DB, err = leveldb.NewStore(leveldb.Options{Path: path.Join(jobPath, "seencheck")}) + return err +} + +func Close() { + globalSeencheck.DB.Close() +} + 
+func isSeen(hash string) (found bool, value string) { + found, err := globalSeencheck.DB.Get(hash, &value) + if err != nil { + panic(err) + } + + return found, value +} + +func seen(hash, value string) { + globalSeencheck.DB.Set(hash, value) + atomic.AddInt64(globalSeencheck.Count, 1) +} + +func SeencheckURLs(URLType string, URLs ...*models.URL) (seencheckedURLs []*models.URL, err error) { + h := fnv.New64a() + + for _, URL := range URLs { + _, err = h.Write([]byte(URL.String())) + if err != nil { + return nil, err + } + + hash := strconv.FormatUint(h.Sum64(), 10) + + found, _ := isSeen(hash) + if !found { + seen(hash, URLType) + seencheckedURLs = append(seencheckedURLs, URL) + } + } + + return seencheckedURLs, nil +} diff --git a/internal/pkg/source/hq.go b/internal/pkg/source/hq/hq.go similarity index 72% rename from internal/pkg/source/hq.go rename to internal/pkg/source/hq/hq.go index 2add78f0..2c20af97 100644 --- a/internal/pkg/source/hq.go +++ b/internal/pkg/source/hq/hq.go @@ -1,113 +1,12 @@ -package source +package hq import ( - "log/slog" - "time" - - "github.com/internetarchive/Zeno/internal/pkg/config" - "github.com/internetarchive/Zeno/internal/pkg/utils" - "github.com/internetarchive/Zeno/pkg/models" "github.com/internetarchive/gocrawlhq" ) -// This function connects to HQ's websocket and listen for messages. -// It also sends and "identify" message to the HQ to let it know that -// Zeno is connected. This "identify" message is sent every second and -// contains the crawler's stats and details. 
-func HQWebsocket() { - var identifyTicker = time.NewTicker(time.Second) - - defer func() { - identifyTicker.Stop() - }() - - for { - err := HQClient.Identify(&gocrawlhq.IdentifyMessage{ - Project: config.Get().HQProject, - Job: config.Get().Job, - IP: utils.GetOutboundIP().String(), - Hostname: utils.GetHostname(), - GoVersion: utils.GetVersion().GoVersion, - }) - if err != nil { - slog.Error("error sending identify payload to Crawl HQ, trying to reconnect", "err", err.Error()) - - err = HQClient.InitWebsocketConn() - if err != nil { - slog.Error("error initializing websocket connection to crawl HQ", "err", err.Error()) - } - } - - <-identifyTicker.C - } -} - -// func HQSeencheckURLs(URLs []*url.URL) (seencheckedBatch []*url.URL, err error) { -// var ( -// discoveredURLs []gocrawlhq.URL -// ) - -// for _, URL := range URLs { -// discoveredURLs = append(discoveredURLs, gocrawlhq.URL{ -// Value: utils.URLToString(URL), -// Type: "asset", -// }) -// } - -// outputURLs, err := HQClient.Seencheck(discoveredURLs) -// if err != nil { -// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ -// "batchLen": len(URLs), -// "urls": discoveredURLs, -// })).Error("error sending seencheck payload to crawl HQ") -// return seencheckedBatch, err -// } - -// if outputURLs != nil { -// for _, URL := range outputURLs { -// // the returned payload only contain new URLs to be crawled by Zeno -// newURL, err := url.Parse(URL.Value) -// if err != nil { -// c.Log.WithFields(c.genLogFields(err, URL, map[string]interface{}{ -// "batchLen": len(URLs), -// })).Error("error parsing URL from HQ seencheck response") -// return seencheckedBatch, err -// } - -// seencheckedBatch = append(seencheckedBatch, newURL) -// } -// } - -// return seencheckedBatch, nil -// } - -// HQSeencheckURL uses Crawl HQ to determine if an URL has been seen before -// in the current crawl project. If the URL is new, it returns true, otherwise -// it returns false. 
If there's an error sending the payload to Crawl HQ, it -// returns an error and the URL is considered new. -func HQSeencheck(URL *models.URL) (bool, error) { - discoveredURL := gocrawlhq.URL{ - Value: URL.String(), - Type: URL.Type(), - } - - outputURLs, err := HQClient.Seencheck([]gocrawlhq.URL{discoveredURL}) - if err != nil { - slog.Error("error sending seencheck payload to crawl HQ", "err", err, "url", URL.String()) - return true, err // return true, don't discard the URL if there's an error - } - - if outputURLs != nil { - for _, URL := range outputURLs { - // the returned payload only contain new URLs to be crawled by Zeno - if URL.Value == discoveredURL.Value { - return true, nil - } - } - } - - return false, nil -} +var ( + HQClient *gocrawlhq.Client +) // func HQProducer() { // defer c.HQChannelsWg.Done() diff --git a/internal/pkg/source/hq/seencheck.go b/internal/pkg/source/hq/seencheck.go new file mode 100644 index 00000000..bafd0554 --- /dev/null +++ b/internal/pkg/source/hq/seencheck.go @@ -0,0 +1,40 @@ +package hq + +import ( + "log/slog" + + "github.com/internetarchive/Zeno/pkg/models" + "github.com/internetarchive/gocrawlhq" +) + +func SeencheckURLs(URLsType string, URLs ...*models.URL) (seencheckedURLs []*models.URL, err error) { + var ( + discoveredURLs []gocrawlhq.URL + ) + + for _, URL := range URLs { + discoveredURLs = append(discoveredURLs, gocrawlhq.URL{ + Value: URL.String(), + Type: URLsType, + }) + } + + outputURLs, err := HQClient.Seencheck(discoveredURLs) + if err != nil { + slog.Error("error sending seencheck payload to crawl HQ", "err", err.Error()) + return URLs, err + } + + if outputURLs != nil { + for _, URL := range URLs { + for _, outputURL := range outputURLs { + if URL.String() == outputURL.Value { + seencheckedURLs = append(seencheckedURLs, URL) + break + } + } + } + } + + return seencheckedURLs, nil +} diff --git a/internal/pkg/source/hq/websocket.go b/internal/pkg/source/hq/websocket.go new file mode 100644 index 
00000000..12be802c --- /dev/null +++ b/internal/pkg/source/hq/websocket.go @@ -0,0 +1,42 @@ +package hq + +import ( + "log/slog" + "time" + + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/utils" + "github.com/internetarchive/gocrawlhq" +) + +// This function connects to HQ's websocket and listen for messages. +// It also sends and "identify" message to the HQ to let it know that +// Zeno is connected. This "identify" message is sent every second and +// contains the crawler's stats and details. +func HQWebsocket() { + var identifyTicker = time.NewTicker(time.Second) + + defer func() { + identifyTicker.Stop() + }() + + for { + err := HQClient.Identify(&gocrawlhq.IdentifyMessage{ + Project: config.Get().HQProject, + Job: config.Get().Job, + IP: utils.GetOutboundIP().String(), + Hostname: utils.GetHostname(), + GoVersion: utils.GetVersion().GoVersion, + }) + if err != nil { + slog.Error("error sending identify payload to Crawl HQ, trying to reconnect", "err", err.Error()) + + err = HQClient.InitWebsocketConn() + if err != nil { + slog.Error("error initializing websocket connection to crawl HQ", "err", err.Error()) + } + } + + <-identifyTicker.C + } +} diff --git a/internal/pkg/source/source.go b/internal/pkg/source/source.go index beb16e87..d150341c 100644 --- a/internal/pkg/source/source.go +++ b/internal/pkg/source/source.go @@ -1,7 +1 @@ package source - -import "github.com/internetarchive/gocrawlhq" - -var ( - HQClient *gocrawlhq.Client -) diff --git a/main.go b/main.go index 4b95d921..e8683a30 100644 --- a/main.go +++ b/main.go @@ -36,8 +36,6 @@ func main() { os.Exit(1) } - fmt.Printf("%+v\n", config.Get()) - // Start the reactor that will receive reactorOutputChan := make(chan *models.Item) err := reactor.Start(config.Get().WorkersCount, reactorOutputChan) diff --git a/pkg/models/url.go b/pkg/models/url.go index a5b0aefd..5d3408db 100644 --- a/pkg/models/url.go +++ b/pkg/models/url.go @@ -24,10 +24,6 @@ 
func (u *URL) Parsed() *url.URL { return u.parsed } -func (u *URL) Type() string { - return u.String() -} - func (u *URL) Hop() int { return u.hop } diff --git a/pkg/models/urltype.go b/pkg/models/urltype.go deleted file mode 100644 index 3e334801..00000000 --- a/pkg/models/urltype.go +++ /dev/null @@ -1,19 +0,0 @@ -package models - -type URLType int64 - -const ( - Seed URLType = iota - Asset -) - -func (t URLType) String() string { - switch t { - case Seed: - return "seed" - case Asset: - return "asset" - } - - return "" -} From 60a9e375999fd02606d9f80481b2e3bb576f7179 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 19 Nov 2024 18:53:36 +0100 Subject: [PATCH 031/295] add: archiver + postprocessor --- go.mod | 12 ++ go.sum | 24 ++++ internal/pkg/archiver/capture.go | 143 ++++++++++++++++++++ internal/pkg/archiver/error.go | 8 ++ internal/pkg/postprocessor/error.go | 8 ++ internal/pkg/postprocessor/postprocessor.go | 97 +++++++++++++ internal/pkg/preprocessor/README.md | 12 +- internal/pkg/preprocessor/error.go | 9 -- internal/pkg/preprocessor/preprocessor.go | 48 +++++-- internal/pkg/source/hq/websocket.go | 2 +- internal/pkg/utils/url.go | 2 +- main.go | 18 +++ pkg/models/url.go | 20 ++- 13 files changed, 369 insertions(+), 34 deletions(-) create mode 100644 internal/pkg/archiver/capture.go create mode 100644 internal/pkg/archiver/error.go create mode 100644 internal/pkg/postprocessor/error.go create mode 100644 internal/pkg/postprocessor/postprocessor.go diff --git a/go.mod b/go.mod index 0b59d61b..e6497dc4 100644 --- a/go.mod +++ b/go.mod @@ -16,6 +16,9 @@ require ( ) require ( + github.com/CorentinB/warc v0.8.53 // indirect + github.com/andybalholm/brotli v1.1.0 // indirect + github.com/cloudflare/circl v1.4.0 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect @@ -24,12 +27,16 @@ require ( github.com/google/go-cmp v0.6.0 // indirect 
github.com/hashicorp/hcl v1.0.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/klauspost/compress v1.17.10 // indirect github.com/magiconair/properties v1.8.7 // indirect + github.com/miekg/dns v1.1.62 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect + github.com/paulbellamy/ratecounter v0.2.0 // indirect github.com/pelletier/go-toml/v2 v2.2.2 // indirect github.com/philippgille/gokv/encoding v0.7.0 // indirect github.com/philippgille/gokv/leveldb v0.7.0 // indirect github.com/philippgille/gokv/util v0.7.0 // indirect + github.com/refraction-networking/utls v1.6.7 // indirect github.com/sagikazarmark/locafero v0.4.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect github.com/sourcegraph/conc v0.3.0 // indirect @@ -37,11 +44,16 @@ require ( github.com/spf13/cast v1.6.0 // indirect github.com/subosito/gotenv v1.6.0 // indirect github.com/syndtr/goleveldb v1.0.0 // indirect + github.com/ulikunitz/xz v0.5.12 // indirect go.uber.org/atomic v1.9.0 // indirect go.uber.org/multierr v1.9.0 // indirect + golang.org/x/crypto v0.29.0 // indirect golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect + golang.org/x/mod v0.21.0 // indirect + golang.org/x/sync v0.9.0 // indirect golang.org/x/sys v0.27.0 // indirect golang.org/x/text v0.20.0 // indirect + golang.org/x/tools v0.25.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 44037eae..60f17a93 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,11 @@ +github.com/CorentinB/warc v0.8.53 h1:xVz3RMdZ6faAqTtLfcK1/yl8ZTansy+B2en//EZLUlM= +github.com/CorentinB/warc v0.8.53/go.mod h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8= +github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= +github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 
h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so= github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw= +github.com/cloudflare/circl v1.4.0 h1:BV7h5MgrktNzytKmWjpOtdYrf0lkkbF8YMlBGPhJQrY= +github.com/cloudflare/circl v1.4.0/go.mod h1:PDRU+oXvdD7KCtgKxW95M5Z8BpSCJXQORiZFnBQS5QU= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -35,17 +41,23 @@ github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/internetarchive/gocrawlhq v1.2.20 h1:0mIIt9lhPacKr6L2JeISoopQ8EgzC3dISJ3ITGGbOp4= github.com/internetarchive/gocrawlhq v1.2.20/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek= +github.com/klauspost/compress v1.17.10 h1:oXAz+Vh0PMUvJczoi+flxpnBEPxoER1IaAnU/NMPtT0= +github.com/klauspost/compress v1.17.10/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= +github.com/miekg/dns v1.1.62 h1:cN8OuEF1/x5Rq6Np+h1epln8OiyPWV+lROx9LxcGgIQ= +github.com/miekg/dns v1.1.62/go.mod h1:mvDlcItzm+br7MToIKqkglaGhlFMHJ9DTNNWONWXbNQ= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= github.com/mitchellh/mapstructure v1.5.0/go.mod 
h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= +github.com/paulbellamy/ratecounter v0.2.0 h1:2L/RhJq+HA8gBQImDXtLPrDXK5qAj6ozWVK/zFXVJGs= +github.com/paulbellamy/ratecounter v0.2.0/go.mod h1:Hfx1hDpSGoqxkVVpBi/IlYD7kChlfo5C6hzIHwPqfFE= github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM= github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs= github.com/philippgille/gokv v0.7.0 h1:rQSIQspete82h78Br7k7rKUZ8JYy/hWlwzm/W5qobPI= @@ -59,6 +71,8 @@ github.com/philippgille/gokv/util v0.7.0/go.mod h1:i9KLHbPxGiHLMhkix/CcDQhpPbCkJ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/refraction-networking/utls v1.6.7 h1:zVJ7sP1dJx/WtVuITug3qYUq034cDq9B2MR1K67ULZM= +github.com/refraction-networking/utls v1.6.7/go.mod h1:BC3O4vQzye5hqpmDTWUqi4P5DDhzJfkV1tdqtawQIH0= github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= @@ -92,18 +106,26 @@ github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8 github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE= github.com/syndtr/goleveldb v1.0.0/go.mod 
h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= +github.com/ulikunitz/xz v0.5.12 h1:37Nm15o69RwBkXM0J6A5OlE67RZTfzUxTj8fB3dfcsc= +github.com/ulikunitz/xz v0.5.12/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= +golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ= +golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg= golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g= golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= +golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0= +golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo= golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ= +golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= @@ 
-111,6 +133,8 @@ golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= +golang.org/x/tools v0.25.0 h1:oFU9pkj/iJgs+0DT+VMHrx+oBKs/LJMV+Uvg78sl+fE= +golang.org/x/tools v0.25.0/go.mod h1:/vtpO8WL1N9cQC3FN5zPqb//fRXskFHbLKk4OW1Q7rg= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/internal/pkg/archiver/capture.go b/internal/pkg/archiver/capture.go new file mode 100644 index 00000000..88e8cf26 --- /dev/null +++ b/internal/pkg/archiver/capture.go @@ -0,0 +1,143 @@ +package archiver + +import ( + "context" + "io" + "net/http" + "sync" + + "github.com/CorentinB/warc" + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/Zeno/pkg/models" +) + +type archiver struct { + wg sync.WaitGroup + ctx context.Context + cancel context.CancelFunc + input chan *models.Item + output chan *models.Item + + Client *warc.CustomHTTPClient + ClientWithProxy *warc.CustomHTTPClient +} + +var ( + globalArchiver *archiver + once sync.Once + logger *log.FieldedLogger +) + +// This functions starts the archiver responsible for capturing the URLs +func Start(inputChan, outputChan chan *models.Item) error { + var done bool + + log.Start() + logger = log.NewFieldedLogger(&log.Fields{ + "component": "archiver", + }) + + once.Do(func() { + ctx, cancel := context.WithCancel(context.Background()) + globalArchiver = &archiver{ + ctx: ctx, + cancel: cancel, + input: inputChan, + output: 
outputChan, + } + globalArchiver.wg.Add(1) + go run() + logger.Info("started") + done = true + }) + + if !done { + return ErrArchiverAlreadyInitialized + } + + return nil +} + +func Stop() { + if globalArchiver != nil { + globalArchiver.cancel() + globalArchiver.wg.Wait() + close(globalArchiver.output) + logger.Info("stopped") + } +} + +func run() { + defer globalArchiver.wg.Done() + + var ( + wg sync.WaitGroup + guard = make(chan struct{}, config.Get().WorkersCount) + ) + + for { + select { + // Closes the run routine when context is canceled + case <-globalArchiver.ctx.Done(): + logger.Info("shutting down") + return + case item, ok := <-globalArchiver.input: + if ok { + guard <- struct{}{} + wg.Add(1) + go func() { + defer wg.Done() + defer func() { <-guard }() + archive(item) + }() + } + } + } +} + +func archive(item *models.Item) { + // TODO: rate limiting handling + + var ( + URLsToCapture []*models.URL + guard = make(chan struct{}, config.Get().MaxConcurrentAssets) + wg *sync.WaitGroup + ) + + // Determines the URLs that need to be captured, if the item's status is fresh we need + // to capture the seed, else we need to capture the child URLs (assets), in parallel + if item.Status == models.ItemFresh { + URLsToCapture = append(URLsToCapture, item.URL) + } else { + URLsToCapture = item.Childs + } + + for _, URL := range URLsToCapture { + guard <- struct{}{} + wg.Add(1) + go func() { + defer wg.Done() + defer func() { <-guard }() + + var ( + err error + resp *http.Response + ) + + if config.Get().Proxy == "" { + resp, err = globalArchiver.ClientWithProxy.Do(URL.GetRequest()) + } else { + resp, err = globalArchiver.Client.Do(URL.GetRequest()) + } + + // For now, we only consume it + _, err = io.Copy(io.Discard, resp.Body) + if err != nil { + logger.Error("unable to consume response body", "url", URL.String(), "err", err.Error(), "func", "archiver.archive") + } + }() + } + + globalArchiver.output <- item +} diff --git a/internal/pkg/archiver/error.go 
b/internal/pkg/archiver/error.go new file mode 100644 index 00000000..d0af5b7b --- /dev/null +++ b/internal/pkg/archiver/error.go @@ -0,0 +1,8 @@ +package archiver + +import "errors" + +var ( + // ErrArchiverAlreadyInitialized is the error returned when the preprocess is already initialized + ErrArchiverAlreadyInitialized = errors.New("archiver already initialized") +) diff --git a/internal/pkg/postprocessor/error.go b/internal/pkg/postprocessor/error.go new file mode 100644 index 00000000..b9ce6de7 --- /dev/null +++ b/internal/pkg/postprocessor/error.go @@ -0,0 +1,8 @@ +package postprocessor + +import "errors" + +var ( + // ErrPostprocessorAlreadyInitialized is the error returned when the postprocessor is already initialized + ErrPostprocessorAlreadyInitialized = errors.New("postprocessor already initialized") +) diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go new file mode 100644 index 00000000..f5db2eba --- /dev/null +++ b/internal/pkg/postprocessor/postprocessor.go @@ -0,0 +1,97 @@ +package postprocessor + +import ( + "context" + "sync" + + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/Zeno/pkg/models" +) + +type postprocessor struct { + wg sync.WaitGroup + ctx context.Context + cancel context.CancelFunc + input chan *models.Item + output chan *models.Item +} + +var ( + globalPostprocessor *postprocessor + once sync.Once + logger *log.FieldedLogger +) + +// This functions starts the preprocessor responsible for preparing +// the seeds sent by the reactor for captures +func Start(inputChan, outputChan chan *models.Item) error { + var done bool + + log.Start() + logger = log.NewFieldedLogger(&log.Fields{ + "component": "postprocessor", + }) + + once.Do(func() { + ctx, cancel := context.WithCancel(context.Background()) + globalPostprocessor = &postprocessor{ + ctx: ctx, + cancel: cancel, + input: inputChan, + output: 
outputChan, + } + globalPostprocessor.wg.Add(1) + go run() + logger.Info("started") + done = true + }) + + if !done { + return ErrPostprocessorAlreadyInitialized + } + + return nil +} + +func Stop() { + if globalPostprocessor != nil { + globalPostprocessor.cancel() + globalPostprocessor.wg.Wait() + close(globalPostprocessor.output) + logger.Info("stopped") + } +} + +func run() { + defer globalPostprocessor.wg.Done() + + var ( + wg sync.WaitGroup + guard = make(chan struct{}, config.Get().WorkersCount) + ) + + for { + select { + // Closes the run routine when context is canceled + case <-globalPostprocessor.ctx.Done(): + logger.Info("shutting down") + return + case item, ok := <-globalPostprocessor.input: + if ok { + guard <- struct{}{} + wg.Add(1) + go func() { + defer wg.Done() + defer func() { <-guard }() + postprocess(item) + }() + } + } + } +} + +func postprocess(item *models.Item) { + // TODO + globalPostprocessor.output <- item +} diff --git a/internal/pkg/preprocessor/README.md b/internal/pkg/preprocessor/README.md index db1cb835..e1ac52f6 100644 --- a/internal/pkg/preprocessor/README.md +++ b/internal/pkg/preprocessor/README.md @@ -1,8 +1,8 @@ # Preprocessor Package Documentation ## Overview -The preprocessor package provides functionality to prepare seeds for capture. It includes mechanisms for validating URLs and preprocessing items before they are sent for capture. The package ensures that operations are atomic and synchronized, maintaining consistency and avoiding race conditions. +The preprocessor package provides functionality to prepare seeds for capture. It includes mechanisms for validating URLs and preprocessoring items before they are sent for capture. The package ensures that operations are atomic and synchronized, maintaining consistency and avoiding race conditions. -The preprocessor package is designed to be used in a concurrent environment, where multiple goroutines may interact with the preprocessor. 
It uses channels to manage the flow of items and their preprocessing status. The package is thread-safe and provides error handling for common scenarios. +The preprocessor package is designed to be used in a concurrent environment, where multiple goroutines may interact with the preprocessor. It uses channels to manage the flow of items and their preprocessoring status. The package is thread-safe and provides error handling for common scenarios. ## Installation To use the preprocessor package, import it into your package: @@ -27,8 +27,8 @@ The initialization should happen once or it will error out with ErrPreprocessorAlreadyInitialized || ErrPreprocessorNotInitialized ``` -### Preprocessing Items -To preprocess an item, send it to the input channel: +### Preprocessoring Items +To preprocessor an item, send it to the input channel: ```go item := &models.Item{ UUID: uuid.New(), @@ -37,11 +37,11 @@ item := &models.Item{ } inputChan <- item ``` -The preprocessed item will be sent to the output channel after preprocessing. +The preprocessored item will be sent to the output channel after preprocessoring. 
## Internals ### Preprocessor Struct -The preprocessor struct holds the state and channels for managing item preprocessing: +The preprocessor struct holds the state and channels for managing item preprocessoring: ```go type preprocessor struct { wg sync.WaitGroup diff --git a/internal/pkg/preprocessor/error.go b/internal/pkg/preprocessor/error.go index 53005de9..ca62b09c 100644 --- a/internal/pkg/preprocessor/error.go +++ b/internal/pkg/preprocessor/error.go @@ -5,13 +5,4 @@ import "errors" var ( // ErrPreprocessorAlreadyInitialized is the error returned when the preprocessor is already initialized ErrPreprocessorAlreadyInitialized = errors.New("preprocessor already initialized") - // ErrPreprocessorNotInitialized is the error returned when the preprocessor is not initialized - ErrPreprocessorNotInitialized = errors.New("preprocessor not initialized") - // ErrPreprocessorShuttingDown is the error returned when the preprocessor is shutting down - ErrPreprocessorShuttingDown = errors.New("preprocessor shutting down") - - // ErrFeedbackItemNotPresent is the error returned when an item was sent to the feedback channel but not found in the state table - ErrFeedbackItemNotPresent = errors.New("feedback item not present in state table") - // ErrFinisehdItemNotFound is the error returned when an item been marked as finished but not found in the state table - ErrFinisehdItemNotFound = errors.New("markAsFinished item not present in state table") ) diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index 46a5ae7e..fa6c1556 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -2,6 +2,7 @@ package preprocessor import ( "context" + "net/http" "sync" "github.com/internetarchive/Zeno/internal/pkg/config" @@ -44,7 +45,7 @@ func Start(inputChan, outputChan chan *models.Item) error { output: outputChan, } globalPreprocessor.wg.Add(1) - go globalPreprocessor.run() + go run() 
logger.Info("started") done = true }) @@ -65,8 +66,8 @@ func Stop() { } } -func (p *preprocessor) run() { - defer p.wg.Done() +func run() { + defer globalPreprocessor.wg.Done() var ( wg sync.WaitGroup @@ -76,24 +77,24 @@ func (p *preprocessor) run() { for { select { // Closes the run routine when context is canceled - case <-p.ctx.Done(): + case <-globalPreprocessor.ctx.Done(): logger.Info("shutting down") return - case item, ok := <-p.input: + case item, ok := <-globalPreprocessor.input: if ok { guard <- struct{}{} wg.Add(1) go func() { defer wg.Done() defer func() { <-guard }() - p.preprocess(item) + preprocess(item) }() } } } } -func (p *preprocessor) preprocess(item *models.Item) { +func preprocess(item *models.Item) { // Validate the URL of either the item itself and/or its childs // TODO: if an error happen and it's a fresh item, we should mark it as failed in HQ (if it's a HQ-based crawl) @@ -110,7 +111,7 @@ func (p *preprocessor) preprocess(item *models.Item) { // Validate the item's URL itself err = validateURL(item.URL, nil) if err != nil { - logger.Warn("unable to validate URL", "url", item.URL.Raw, "err", err.Error(), "func", "preprocessor.preprocess") + logger.Warn("unable to validate URL", "url", item.URL.Raw, "err", err.Error(), "func", "preprocessor.preprocessor") return } @@ -125,7 +126,7 @@ func (p *preprocessor) preprocess(item *models.Item) { err = validateURL(item.Childs[i], item.URL) if err != nil { // If we can't validate an URL, we remove it from the list of childs - logger.Warn("unable to validate URL", "url", item.Childs[i].Raw, "err", err.Error(), "func", "preprocessor.preprocess") + logger.Warn("unable to validate URL", "url", item.Childs[i].Raw, "err", err.Error(), "func", "preprocessor.preprocessor") item.Childs = append(item.Childs[:i], item.Childs[i+1:]...) 
} else { if config.Get().UseSeencheck { @@ -136,7 +137,7 @@ func (p *preprocessor) preprocess(item *models.Item) { } } } else { - logger.Error("item got into preprocessing without anything to preprocess") + logger.Error("item got into preprocessoring without anything to preprocessor") } // If we have URLs to seencheck, we do it @@ -168,6 +169,29 @@ func (p *preprocessor) preprocess(item *models.Item) { } } - // Final step, send the preprocessed item to the output chan of the preprocessor - p.output <- item + // Finally, we build the requests, applying any site-specific behavior needed + if URLType == "seed" { + // TODO: apply site-specific stuff + req, err := http.NewRequest(http.MethodGet, item.URL.String(), nil) + if err != nil { + logger.Error("unable to create new request for URL", "url", item.URL.String(), "err", err.Error(), "func", "preprocessor.preprocess") + return + } + + item.URL.SetRequest(req) + } else { + for i, child := range item.Childs { + // TODO: apply site-specific stuff + req, err := http.NewRequest(http.MethodGet, child.String(), nil) + if err != nil { + logger.Error("unable to create new request for URL", "url", item.URL.String(), "err", err.Error(), "func", "preprocessor.preprocess") + return + } + + item.Childs[i].SetRequest(req) + } + } + + // Final step, send the preprocessored item to the output chan of the preprocessor + globalPreprocessor.output <- item } diff --git a/internal/pkg/source/hq/websocket.go b/internal/pkg/source/hq/websocket.go index 12be802c..eda4d5b5 100644 --- a/internal/pkg/source/hq/websocket.go +++ b/internal/pkg/source/hq/websocket.go @@ -13,7 +13,7 @@ import ( // It also sends and "identify" message to the HQ to let it know that // Zeno is connected. This "identify" message is sent every second and // contains the crawler's stats and details. 
-func HQWebsocket() { +func Websocket() { var identifyTicker = time.NewTicker(time.Second) defer func() { diff --git a/internal/pkg/utils/url.go b/internal/pkg/utils/url.go index 894cc122..31b0b267 100644 --- a/internal/pkg/utils/url.go +++ b/internal/pkg/utils/url.go @@ -13,7 +13,7 @@ func DedupeURLs(URLs []*models.URL) []*models.URL { if _, value := keys[entry.String()]; !value { keys[entry.String()] = true - if entry.Parsed().Scheme == "http" || entry.Parsed().Scheme == "https" { + if entry.GetParsed().Scheme == "http" || entry.GetParsed().Scheme == "https" { list = append(list, entry) } } diff --git a/main.go b/main.go index e8683a30..d3ff11ea 100644 --- a/main.go +++ b/main.go @@ -13,8 +13,10 @@ import ( "os" "github.com/internetarchive/Zeno/cmd" + "github.com/internetarchive/Zeno/internal/pkg/archiver" "github.com/internetarchive/Zeno/internal/pkg/config" "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/Zeno/internal/pkg/postprocessor" "github.com/internetarchive/Zeno/internal/pkg/preprocessor" "github.com/internetarchive/Zeno/internal/pkg/reactor" "github.com/internetarchive/Zeno/pkg/models" @@ -52,4 +54,20 @@ func main() { return } defer preprocessor.Stop() + + archiverOutputChan := make(chan *models.Item) + err = archiver.Start(preprocessorOutputChan, archiverOutputChan) + if err != nil { + logger.Error("error starting archiver", "err", err.Error()) + return + } + defer archiver.Stop() + + postprocessorOutputChan := make(chan *models.Item) + err = postprocessor.Start(archiverOutputChan, postprocessorOutputChan) + if err != nil { + logger.Error("error starting postprocessor", "err", err.Error()) + return + } + defer postprocessor.Stop() } diff --git a/pkg/models/url.go b/pkg/models/url.go index 5d3408db..3e2eab47 100644 --- a/pkg/models/url.go +++ b/pkg/models/url.go @@ -3,6 +3,7 @@ package models import ( "log/slog" "net" + "net/http" "net/url" "strings" @@ -10,9 +11,10 @@ import ( ) type URL struct { - Raw string - parsed 
*url.URL - hop int // This determines the number of hops this item is the result of, a hop is a "jump" from 1 page to another page + Raw string + parsed *url.URL + request *http.Request + hop int // This determines the number of hops this item is the result of, a hop is a "jump" from 1 page to another page } func (u *URL) Parse() (err error) { @@ -20,11 +22,19 @@ func (u *URL) Parse() (err error) { return err } -func (u *URL) Parsed() *url.URL { +func (u *URL) SetRequest(r *http.Request) { + u.request = r +} + +func (u *URL) GetRequest() *http.Request { + return u.request +} + +func (u *URL) GetParsed() *url.URL { return u.parsed } -func (u *URL) Hop() int { +func (u *URL) GetHop() int { return u.hop } From 9112fbb99c91caa3a7b0e101144e702153d8043d Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 19:08:14 +0100 Subject: [PATCH 032/295] stats: add counter struct and add routines stats and methods --- go.mod | 6 +-- go.sum | 14 ++++++- internal/pkg/stats/counter.go | 23 +++++++++++ internal/pkg/stats/methods.go | 75 +++++++++++++++++++++++++++++++++++ internal/pkg/stats/rate.go | 6 +++ internal/pkg/stats/stats.go | 24 +++++++++-- 6 files changed, 138 insertions(+), 10 deletions(-) create mode 100644 internal/pkg/stats/counter.go create mode 100644 internal/pkg/stats/methods.go diff --git a/go.mod b/go.mod index e6497dc4..8045fe1d 100644 --- a/go.mod +++ b/go.mod @@ -3,11 +3,12 @@ module github.com/internetarchive/Zeno go 1.23.3 require ( - github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 + github.com/CorentinB/warc v0.8.53 github.com/elastic/go-elasticsearch v0.0.0 github.com/elastic/go-elasticsearch/v7 v7.17.10 github.com/google/uuid v1.6.0 github.com/internetarchive/gocrawlhq v1.2.20 + github.com/philippgille/gokv/leveldb v0.7.0 github.com/spf13/cobra v1.8.1 github.com/spf13/pflag v1.0.5 github.com/spf13/viper v1.19.0 @@ -16,7 +17,6 @@ require ( ) require ( - github.com/CorentinB/warc v0.8.53 // indirect 
github.com/andybalholm/brotli v1.1.0 // indirect github.com/cloudflare/circl v1.4.0 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect @@ -24,7 +24,6 @@ require ( github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/ws v1.4.0 // indirect github.com/golang/snappy v0.0.1 // indirect - github.com/google/go-cmp v0.6.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/klauspost/compress v1.17.10 // indirect @@ -34,7 +33,6 @@ require ( github.com/paulbellamy/ratecounter v0.2.0 // indirect github.com/pelletier/go-toml/v2 v2.2.2 // indirect github.com/philippgille/gokv/encoding v0.7.0 // indirect - github.com/philippgille/gokv/leveldb v0.7.0 // indirect github.com/philippgille/gokv/util v0.7.0 // indirect github.com/refraction-networking/utls v1.6.7 // indirect github.com/sagikazarmark/locafero v0.4.0 // indirect diff --git a/go.sum b/go.sum index 60f17a93..7f815386 100644 --- a/go.sum +++ b/go.sum @@ -2,8 +2,8 @@ github.com/CorentinB/warc v0.8.53 h1:xVz3RMdZ6faAqTtLfcK1/yl8ZTansy+B2en//EZLUlM github.com/CorentinB/warc v0.8.53/go.mod h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8= github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= -github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so= -github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/cloudflare/circl v1.4.0 h1:BV7h5MgrktNzytKmWjpOtdYrf0lkkbF8YMlBGPhJQrY= github.com/cloudflare/circl v1.4.0/go.mod 
h1:PDRU+oXvdD7KCtgKxW95M5Z8BpSCJXQORiZFnBQS5QU= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= @@ -20,6 +20,8 @@ github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7z github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/go-test/deep v1.1.0 h1:WOcxcdHcvdgThNXjw0t76K42FXTU7HpNQWHpA2HHNlg= +github.com/go-test/deep v1.1.0/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE= github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU= github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= @@ -36,6 +38,7 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= +github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= @@ -54,7 +57,9 @@ github.com/miekg/dns v1.1.62/go.mod h1:mvDlcItzm+br7MToIKqkglaGhlFMHJ9DTNNWONWXb github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= 
+github.com/onsi/ginkgo v1.7.0 h1:WSHQ+IS43OoUrWtD1/bbclrwK8TTH5hzp+umCiuxHgs= github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/gomega v1.4.3 h1:RE1xgDvH7imwFD45h+u2SgIfERHlS2yNG4DObb5BSKU= github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= github.com/paulbellamy/ratecounter v0.2.0 h1:2L/RhJq+HA8gBQImDXtLPrDXK5qAj6ozWVK/zFXVJGs= github.com/paulbellamy/ratecounter v0.2.0/go.mod h1:Hfx1hDpSGoqxkVVpBi/IlYD7kChlfo5C6hzIHwPqfFE= @@ -66,6 +71,8 @@ github.com/philippgille/gokv/encoding v0.7.0 h1:2oxepKzzTsi00iLZBCZ7Rmqrallh9zws github.com/philippgille/gokv/encoding v0.7.0/go.mod h1:yncOBBUciyniPI8t5ECF8XSCwhONE9Rjf3My5IHs3fA= github.com/philippgille/gokv/leveldb v0.7.0 h1:QTH83utBG8knTTFzO1tIF1amKIjz9xxOPLaZrU48kdQ= github.com/philippgille/gokv/leveldb v0.7.0/go.mod h1:EE/vyZ5VwPPWwZHKJYWd/rkqUIJXFykKA5eluazFByc= +github.com/philippgille/gokv/test v0.7.0 h1:0wBKnKaFZlSeHxLXcmUJqK//IQGUMeu+o8B876KCiOM= +github.com/philippgille/gokv/test v0.7.0/go.mod h1:TP/VzO/qAoi6njsfKnRpXKno0hRuzD5wsLnHhtUcVkY= github.com/philippgille/gokv/util v0.7.0 h1:5avUK/a3aSj/aWjhHv4/FkqgMon2B7k2BqFgLcR+DYg= github.com/philippgille/gokv/util v0.7.0/go.mod h1:i9KLHbPxGiHLMhkix/CcDQhpPbCkJy5BkW+RKgwDHMo= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -138,10 +145,13 @@ golang.org/x/tools v0.25.0/go.mod h1:/vtpO8WL1N9cQC3FN5zPqb//fRXskFHbLKk4OW1Q7rg gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= gopkg.in/ini.v1 v1.67.0 
h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= +gopkg.in/yaml.v2 v2.2.1 h1:mUhvW9EsL+naU5Q3cakzfE91YhliOondGd6ZrsDBHQE= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= diff --git a/internal/pkg/stats/counter.go b/internal/pkg/stats/counter.go new file mode 100644 index 00000000..7af2505e --- /dev/null +++ b/internal/pkg/stats/counter.go @@ -0,0 +1,23 @@ +package stats + +import "sync/atomic" + +type counter struct { + count uint64 +} + +func (c *counter) incr(step uint64) { + atomic.AddUint64(&c.count, step) +} + +func (c *counter) decr(step uint64) { + atomic.AddUint64(&c.count, ^uint64(step-1)) +} + +func (c *counter) get() uint64 { + return atomic.LoadUint64(&c.count) +} + +func (c *counter) reset() { + atomic.StoreUint64(&c.count, 0) +} diff --git a/internal/pkg/stats/methods.go b/internal/pkg/stats/methods.go new file mode 100644 index 00000000..268b49c0 --- /dev/null +++ b/internal/pkg/stats/methods.go @@ -0,0 +1,75 @@ +package stats + +///////////////////////// +// URLsCrawled // +///////////////////////// + +// URLsCrawledIncr increments the URLsCrawled counter by 1. +func URLsCrawledIncr() { globalStats.URLsCrawled.incr(1) } + +// URLsCrawledGet returns the current value of the URLsCrawled counter. +func URLsCrawledGet() uint64 { return globalStats.URLsCrawled.get() } + +// URLsCrawledReset resets the URLsCrawled counter to 0. 
+func URLsCrawledReset() { globalStats.URLsCrawled.reset() } + +///////////////////////// +// SeedsFinished // +///////////////////////// + +// SeedsFinishedIncr increments the SeedsFinished counter by 1. +func SeedsFinishedIncr() { globalStats.SeedsFinished.incr(1) } + +// SeedsFinishedGet returns the current value of the SeedsFinished counter. +func SeedsFinishedGet() uint64 { return globalStats.SeedsFinished.get() } + +// SeedsFinishedReset resets the SeedsFinished counter to 0. +func SeedsFinishedReset() { globalStats.SeedsFinished.reset() } + +////////////////////////// +// PreprocessorRoutines // +////////////////////////// + +// PreprocessorRoutinesIncr increments the PreprocessorRoutines counter by 1. +func PreprocessorRoutinesIncr() { globalStats.PreprocessorRoutines.incr(1) } + +// PreprocessorRoutinesDecr decrements the PreprocessorRoutines counter by 1. +func PreprocessorRoutinesDecr() { globalStats.PreprocessorRoutines.decr(1) } + +// PreprocessorRoutinesGet returns the current value of the PreprocessorRoutines counter. +func PreprocessorRoutinesGet() uint64 { return globalStats.PreprocessorRoutines.get() } + +// PreprocessorRoutinesReset resets the PreprocessorRoutines counter to 0. +func PreprocessorRoutinesReset() { globalStats.PreprocessorRoutines.reset() } + +////////////////////////// +// ArchiverRoutines // +////////////////////////// + +// ArchiverRoutinesIncr increments the ArchiverRoutines counter by 1. +func ArchiverRoutinesIncr() { globalStats.ArchiverRoutines.incr(1) } + +// ArchiverRoutinesDecr decrements the ArchiverRoutines counter by 1. +func ArchiverRoutinesDecr() { globalStats.ArchiverRoutines.decr(1) } + +// ArchiverRoutinesGet returns the current value of the ArchiverRoutines counter. +func ArchiverRoutinesGet() uint64 { return globalStats.ArchiverRoutines.get() } + +// ArchiverRoutinesReset resets the ArchiverRoutines counter to 0. 
+func ArchiverRoutinesReset() { globalStats.ArchiverRoutines.reset() } + +////////////////////////// +// PostprocessorRoutines // +////////////////////////// + +// PostprocessorRoutinesIncr increments the PostprocessorRoutines counter by 1. +func PostprocessorRoutinesIncr() { globalStats.PostprocessorRoutines.incr(1) } + +// PostprocessorRoutinesDecr decrements the PostprocessorRoutines counter by 1. +func PostprocessorRoutinesDecr() { globalStats.PostprocessorRoutines.decr(1) } + +// PostprocessorRoutinesGet returns the current value of the PostprocessorRoutines counter. +func PostprocessorRoutinesGet() uint64 { return globalStats.PostprocessorRoutines.get() } + +// PostprocessorRoutinesReset resets the PostprocessorRoutines counter to 0. +func PostprocessorRoutinesReset() { globalStats.PostprocessorRoutines.reset() } diff --git a/internal/pkg/stats/rate.go b/internal/pkg/stats/rate.go index 4ba24cc7..ed969f55 100644 --- a/internal/pkg/stats/rate.go +++ b/internal/pkg/stats/rate.go @@ -34,3 +34,9 @@ func (rps *rate) get() uint64 { func (rps *rate) getTotal() uint64 { return atomic.LoadUint64(&rps.count) } + +func (rps *rate) reset() { + atomic.StoreUint64(&rps.count, 0) + atomic.StoreUint64(&rps.lastCount, 0) + atomic.StoreInt64(&rps.lastUpdate, 0) +} diff --git a/internal/pkg/stats/stats.go b/internal/pkg/stats/stats.go index 43f0f34a..2b6a18a6 100644 --- a/internal/pkg/stats/stats.go +++ b/internal/pkg/stats/stats.go @@ -3,8 +3,11 @@ package stats import "sync" type stats struct { - URLsCrawled *rate - SeedsFinished *rate + URLsCrawled *rate + SeedsFinished *rate + PreprocessorRoutines *counter + ArchiverRoutines *counter + PostprocessorRoutines *counter } var ( @@ -14,10 +17,14 @@ var ( func Init() error { var done = false + doOnce.Do(func() { globalStats = &stats{ - URLsCrawled: &rate{}, - SeedsFinished: &rate{}, + URLsCrawled: &rate{}, + SeedsFinished: &rate{}, + PreprocessorRoutines: &counter{}, + ArchiverRoutines: &counter{}, + PostprocessorRoutines: 
&counter{}, } done = true }) @@ -25,5 +32,14 @@ func Init() error { if !done { return ErrStatsAlreadyInitialized } + return nil } + +func Reset() { + globalStats.URLsCrawled.reset() + globalStats.SeedsFinished.reset() + globalStats.PreprocessorRoutines.reset() + globalStats.ArchiverRoutines.reset() + globalStats.PostprocessorRoutines.reset() +} From a022d0231dba5ff5f4a8961c002895aa0c711357 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 19 Nov 2024 19:14:06 +0100 Subject: [PATCH 033/295] add: WARC writing --- cmd/get.go | 4 +- internal/pkg/archiver/capture.go | 4 ++ internal/pkg/archiver/warc.go | 79 ++++++++++++++++++++++++++++++++ internal/pkg/config/config.go | 10 ++-- 4 files changed, 90 insertions(+), 7 deletions(-) create mode 100644 internal/pkg/archiver/warc.go diff --git a/cmd/get.go b/cmd/get.go index 65fa8295..c775ca4a 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -70,11 +70,11 @@ func getCMDsFlags(getCmd *cobra.Command) { getCmd.PersistentFlags().Bool("warc-on-disk", false, "Do not use RAM to store payloads when recording traffic to WARCs, everything will happen on disk (usually used to reduce memory usage).") getCmd.PersistentFlags().Int("warc-pool-size", 1, "Number of concurrent WARC files to write.") getCmd.PersistentFlags().String("warc-temp-dir", "", "Custom directory to use for WARC temporary files.") - getCmd.PersistentFlags().Bool("disable-local-dedupe", false, "Disable local URL agonistic deduplication.") + getCmd.PersistentFlags().Bool("disable-local-dedupe", false, "Disable local URL agnostic deduplication.") getCmd.PersistentFlags().Bool("cert-validation", false, "Enables certificate validation on HTTPS requests.") getCmd.PersistentFlags().Bool("disable-assets-capture", false, "Disable assets capture.") getCmd.PersistentFlags().Int("warc-dedupe-size", 1024, "Minimum size to deduplicate WARC records with revisit records.") - getCmd.PersistentFlags().String("cdx-cookie", "", "Pass custom cookie during CDX requests. 
Example: 'cdx_auth_token=test_value'") + getCmd.PersistentFlags().String("warc-cdx-cookie", "", "Pass custom cookie during CDX requests. Example: 'cdx_auth_token=test_value'") getCmd.PersistentFlags().Int("warc-size", 1024, "Size of the WARC files in MB.") // Logging flags diff --git a/internal/pkg/archiver/capture.go b/internal/pkg/archiver/capture.go index 88e8cf26..971b8af7 100644 --- a/internal/pkg/archiver/capture.go +++ b/internal/pkg/archiver/capture.go @@ -46,6 +46,10 @@ func Start(inputChan, outputChan chan *models.Item) error { input: inputChan, output: outputChan, } + + // Setup WARC writing HTTP clients + startWARCWriter() + globalArchiver.wg.Add(1) go run() logger.Info("started") diff --git a/internal/pkg/archiver/warc.go b/internal/pkg/archiver/warc.go new file mode 100644 index 00000000..039cd41a --- /dev/null +++ b/internal/pkg/archiver/warc.go @@ -0,0 +1,79 @@ +package archiver + +import ( + "os" + + "github.com/CorentinB/warc" + "github.com/internetarchive/Zeno/internal/pkg/config" +) + +func startWARCWriter() { + // Configure WARC rotator settings + rotatorSettings := warc.NewRotatorSettings() + rotatorSettings.Prefix = config.Get().WARCPrefix + rotatorSettings.WARCWriterPoolSize = config.Get().WARCPoolSize + rotatorSettings.WarcSize = float64(config.Get().WARCSize) + rotatorSettings.OutputDirectory = config.Get().JobPath + + // Configure WARC dedupe settings + dedupeOptions := warc.DedupeOptions{LocalDedupe: !config.Get().DisableLocalDedupe, SizeThreshold: config.Get().WARCDedupeSize} + if config.Get().CDXDedupeServer != "" { + dedupeOptions = warc.DedupeOptions{ + LocalDedupe: !config.Get().DisableLocalDedupe, + CDXDedupe: true, + CDXURL: config.Get().CDXDedupeServer, + CDXCookie: config.Get().CDXCookie, + SizeThreshold: config.Get().WARCDedupeSize, + } + } + + // Configure WARC settings + WARCSettings := warc.HTTPClientSettings{ + RotatorSettings: rotatorSettings, + DedupeOptions: dedupeOptions, + DecompressBody: true, + SkipHTTPStatusCodes: 
[]int{429}, + VerifyCerts: config.Get().CertValidation, + TempDir: config.Get().WARCTempDir, + FullOnDisk: config.Get().WARCOnDisk, + RandomLocalIP: config.Get().RandomLocalIP, + DisableIPv4: config.Get().DisableIPv4, + DisableIPv6: config.Get().DisableIPv6, + IPv6AnyIP: config.Get().IPv6AnyIP, + } + + // Instantiate WARC client + var err error + if config.Get().Proxy != "" { + proxiedWARCSettings := WARCSettings + proxiedWARCSettings.Proxy = config.Get().Proxy + globalArchiver.ClientWithProxy, err = warc.NewWARCWritingHTTPClient(proxiedWARCSettings) + if err != nil { + logger.Error("unable to init proxied WARC HTTP client", "err", err.Error(), "func", "archiver.startWARCWriter") + os.Exit(1) + } + } + + // Even if a proxied client has been set, we want to create an non-proxied one + // if DomainsBypassProxy is used. The domains specified in this slice won't go + // through the proxied client, but through a "normal" client + if config.Get().Proxy == "" || len(config.Get().DomainsBypassProxy) > 0 { + globalArchiver.ClientWithProxy, err = warc.NewWARCWritingHTTPClient(WARCSettings) + if err != nil { + logger.Error("unable to init WARC HTTP client", "err", err.Error(), "func", "archiver.startWARCWriter") + os.Exit(1) + } + } + + go func() { + for err := range globalArchiver.Client.ErrChan { + logger.Error("WARC writer error", "err", err.Err.Error(), "func", err.Func) + } + }() + + go func() { + for err := range globalArchiver.ClientWithProxy.ErrChan { + logger.Error("WARC writer error", "err", err.Err.Error(), "func", err.Func) + } + }() +} diff --git a/internal/pkg/config/config.go b/internal/pkg/config/config.go index 07604e54..573fb07e 100644 --- a/internal/pkg/config/config.go +++ b/internal/pkg/config/config.go @@ -35,10 +35,13 @@ type Config struct { PrometheusPrefix string `mapstructure:"prometheus-prefix"` WARCPrefix string `mapstructure:"warc-prefix"` WARCOperator string `mapstructure:"warc-operator"` - CDXDedupeServer string 
`mapstructure:"warc-cdx-dedupe-server"` WARCTempDir string `mapstructure:"warc-temp-dir"` WARCSize int `mapstructure:"warc-size"` - CDXCookie string `mapstructure:"cdx-cookie"` + WARCOnDisk bool `mapstructure:"warc-on-disk"` + WARCPoolSize int `mapstructure:"warc-pool-size"` + WARCDedupeSize int `mapstructure:"warc-dedupe-size"` + CDXDedupeServer string `mapstructure:"warc-cdx-dedupe-server"` + CDXCookie string `mapstructure:"warc-cdx-cookie"` HQAddress string `mapstructure:"hq-address"` HQKey string `mapstructure:"hq-key"` HQSecret string `mapstructure:"hq-secret"` @@ -65,8 +68,6 @@ type Config struct { CrawlTimeLimit int `mapstructure:"crawl-time-limit"` CrawlMaxTimeLimit int `mapstructure:"crawl-max-time-limit"` MinSpaceRequired int `mapstructure:"min-space-required"` - WARCPoolSize int `mapstructure:"warc-pool-size"` - WARCDedupeSize int `mapstructure:"warc-dedupe-size"` KeepCookies bool `mapstructure:"keep-cookies"` Headless bool `mapstructure:"headless"` JSON bool `mapstructure:"json"` @@ -76,7 +77,6 @@ type Config struct { Prometheus bool `mapstructure:"prometheus"` DomainsCrawl bool `mapstructure:"domains-crawl"` CaptureAlternatePages bool `mapstructure:"capture-alternate-pages"` - WARCOnDisk bool `mapstructure:"warc-on-disk"` DisableLocalDedupe bool `mapstructure:"disable-local-dedupe"` CertValidation bool `mapstructure:"cert-validation"` DisableAssetsCapture bool `mapstructure:"disable-assets-capture"` From d1ad2cb1e2a94aac4be7248e4d748bcd1b7a5180 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 19:17:57 +0100 Subject: [PATCH 034/295] make the core steps use stats routines counters --- internal/pkg/archiver/capture.go | 5 +++++ internal/pkg/postprocessor/postprocessor.go | 5 +++++ internal/pkg/preprocessor/preprocessor.go | 5 +++++ 3 files changed, 15 insertions(+) diff --git a/internal/pkg/archiver/capture.go b/internal/pkg/archiver/capture.go index 971b8af7..238ea721 100644 --- a/internal/pkg/archiver/capture.go +++ 
b/internal/pkg/archiver/capture.go @@ -9,6 +9,7 @@ import ( "github.com/CorentinB/warc" "github.com/internetarchive/Zeno/internal/pkg/config" "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/Zeno/internal/pkg/stats" "github.com/internetarchive/Zeno/pkg/models" ) @@ -38,6 +39,8 @@ func Start(inputChan, outputChan chan *models.Item) error { "component": "archiver", }) + stats.Init() + once.Do(func() { ctx, cancel := context.WithCancel(context.Background()) globalArchiver = &archiver{ @@ -90,9 +93,11 @@ func run() { if ok { guard <- struct{}{} wg.Add(1) + stats.ArchiverRoutinesIncr() go func() { defer wg.Done() defer func() { <-guard }() + defer stats.ArchiverRoutinesDecr() archive(item) }() } diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index f5db2eba..7564c334 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -6,6 +6,7 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/config" "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/Zeno/internal/pkg/stats" "github.com/internetarchive/Zeno/pkg/models" ) @@ -33,6 +34,8 @@ func Start(inputChan, outputChan chan *models.Item) error { "component": "postprocessor", }) + stats.Init() + once.Do(func() { ctx, cancel := context.WithCancel(context.Background()) globalPostprocessor = &postprocessor{ @@ -81,9 +84,11 @@ func run() { if ok { guard <- struct{}{} wg.Add(1) + stats.PostprocessorRoutinesIncr() go func() { defer wg.Done() defer func() { <-guard }() + defer stats.PostprocessorRoutinesDecr() postprocess(item) }() } diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index fa6c1556..5dfca067 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -9,6 +9,7 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/log" 
"github.com/internetarchive/Zeno/internal/pkg/preprocessor/seencheck" "github.com/internetarchive/Zeno/internal/pkg/source/hq" + "github.com/internetarchive/Zeno/internal/pkg/stats" "github.com/internetarchive/Zeno/pkg/models" ) @@ -36,6 +37,8 @@ func Start(inputChan, outputChan chan *models.Item) error { "component": "preprocessor", }) + stats.Init() + once.Do(func() { ctx, cancel := context.WithCancel(context.Background()) globalPreprocessor = &preprocessor{ @@ -84,9 +87,11 @@ func run() { if ok { guard <- struct{}{} wg.Add(1) + stats.PreprocessorRoutinesIncr() go func() { defer wg.Done() defer func() { <-guard }() + defer stats.PreprocessorRoutinesDecr() preprocess(item) }() } From 1a2d2fb71d2cc94a7da385e4f959b695a67fa71f Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 19:23:07 +0100 Subject: [PATCH 035/295] stats: add unit tests to counter --- internal/pkg/stats/counter_test.go | 76 ++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 internal/pkg/stats/counter_test.go diff --git a/internal/pkg/stats/counter_test.go b/internal/pkg/stats/counter_test.go new file mode 100644 index 00000000..ed8323f6 --- /dev/null +++ b/internal/pkg/stats/counter_test.go @@ -0,0 +1,76 @@ +package stats + +import ( + "sync/atomic" + "testing" +) + +func TestCounter_Incr(t *testing.T) { + c := &counter{} + + // Increment the counter by 1 + c.incr(1) + if atomic.LoadUint64(&c.count) != 1 { + t.Errorf("expected count to be 1, got %d", atomic.LoadUint64(&c.count)) + } + + // Increment the counter by 5 + c.incr(5) + if atomic.LoadUint64(&c.count) != 6 { + t.Errorf("expected count to be 6, got %d", atomic.LoadUint64(&c.count)) + } +} + +func TestCounter_Decr(t *testing.T) { + c := &counter{} + + // Increment the counter by 10 + c.incr(10) + if atomic.LoadUint64(&c.count) != 10 { + t.Errorf("expected count to be 10, got %d", atomic.LoadUint64(&c.count)) + } + + // Decrement the counter by 3 + c.decr(3) + if atomic.LoadUint64(&c.count) != 
7 { + t.Errorf("expected count to be 7, got %d", atomic.LoadUint64(&c.count)) + } + + // Decrement the counter by 7 + c.decr(7) + if atomic.LoadUint64(&c.count) != 0 { + t.Errorf("expected count to be 0, got %d", atomic.LoadUint64(&c.count)) + } +} + +func TestCounter_Get(t *testing.T) { + c := &counter{} + + // Increment the counter by 4 + c.incr(4) + if c.get() != 4 { + t.Errorf("expected count to be 4, got %d", c.get()) + } + + // Decrement the counter by 2 + c.decr(2) + if c.get() != 2 { + t.Errorf("expected count to be 2, got %d", c.get()) + } +} + +func TestCounter_Reset(t *testing.T) { + c := &counter{} + + // Increment the counter by 8 + c.incr(8) + if atomic.LoadUint64(&c.count) != 8 { + t.Errorf("expected count to be 8, got %d", atomic.LoadUint64(&c.count)) + } + + // Reset the counter + c.reset() + if atomic.LoadUint64(&c.count) != 0 { + t.Errorf("expected count to be 0 after reset, got %d", atomic.LoadUint64(&c.count)) + } +} From 17fed75b37444d92aed85e3408046b53521f5963 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 20:00:13 +0100 Subject: [PATCH 036/295] global: added error channel, renamed channels to *Ch ; finisher: mock --- .../pkg/archiver/{capture.go => archiver.go} | 28 +++--- internal/pkg/finish/finish.go | 1 - internal/pkg/finisher/error.go | 10 ++ internal/pkg/finisher/finisher.go | 95 +++++++++++++++++++ internal/pkg/postprocessor/postprocessor.go | 28 +++--- internal/pkg/preprocessor/preprocessor.go | 28 +++--- internal/pkg/reactor/reactor.go | 4 +- pkg/models/seed.go | 1 + 8 files changed, 154 insertions(+), 41 deletions(-) rename internal/pkg/archiver/{capture.go => archiver.go} (85%) delete mode 100644 internal/pkg/finish/finish.go create mode 100644 internal/pkg/finisher/error.go create mode 100644 internal/pkg/finisher/finisher.go diff --git a/internal/pkg/archiver/capture.go b/internal/pkg/archiver/archiver.go similarity index 85% rename from internal/pkg/archiver/capture.go rename to 
internal/pkg/archiver/archiver.go index 238ea721..a1090402 100644 --- a/internal/pkg/archiver/capture.go +++ b/internal/pkg/archiver/archiver.go @@ -14,11 +14,12 @@ import ( ) type archiver struct { - wg sync.WaitGroup - ctx context.Context - cancel context.CancelFunc - input chan *models.Item - output chan *models.Item + wg sync.WaitGroup + ctx context.Context + cancel context.CancelFunc + inputCh chan *models.Item + outputCh chan *models.Item + errorCh chan *models.Item Client *warc.CustomHTTPClient ClientWithProxy *warc.CustomHTTPClient @@ -31,7 +32,7 @@ var ( ) // This functions starts the archiver responsible for capturing the URLs -func Start(inputChan, outputChan chan *models.Item) error { +func Start(inputChan, outputChan, errorChan chan *models.Item) error { var done bool log.Start() @@ -44,10 +45,11 @@ func Start(inputChan, outputChan chan *models.Item) error { once.Do(func() { ctx, cancel := context.WithCancel(context.Background()) globalArchiver = &archiver{ - ctx: ctx, - cancel: cancel, - input: inputChan, - output: outputChan, + ctx: ctx, + cancel: cancel, + inputCh: inputChan, + outputCh: outputChan, + errorCh: errorChan, } // Setup WARC writing HTTP clients @@ -70,7 +72,7 @@ func Stop() { if globalArchiver != nil { globalArchiver.cancel() globalArchiver.wg.Wait() - close(globalArchiver.output) + close(globalArchiver.outputCh) logger.Info("stopped") } } @@ -89,7 +91,7 @@ func run() { case <-globalArchiver.ctx.Done(): logger.Info("shutting down") return - case item, ok := <-globalArchiver.input: + case item, ok := <-globalArchiver.inputCh: if ok { guard <- struct{}{} wg.Add(1) @@ -148,5 +150,5 @@ func archive(item *models.Item) { }() } - globalArchiver.output <- item + globalArchiver.outputCh <- item } diff --git a/internal/pkg/finish/finish.go b/internal/pkg/finish/finish.go deleted file mode 100644 index 5105e663..00000000 --- a/internal/pkg/finish/finish.go +++ /dev/null @@ -1 +0,0 @@ -package finish diff --git a/internal/pkg/finisher/error.go 
b/internal/pkg/finisher/error.go new file mode 100644 index 00000000..140c3a68 --- /dev/null +++ b/internal/pkg/finisher/error.go @@ -0,0 +1,10 @@ +package finisher + +import "errors" + +var ( + // ErrFinisherAlreadyInitialized is the error returned when the finisher is already initialized + ErrFinisherAlreadyInitialized = errors.New("finisher already initialized") + // ErrFinisherNotInitialized is the error returned when the finisher is not initialized + ErrFinisherNotInitialized = errors.New("finisher not initialized") +) diff --git a/internal/pkg/finisher/finisher.go b/internal/pkg/finisher/finisher.go new file mode 100644 index 00000000..3f98505a --- /dev/null +++ b/internal/pkg/finisher/finisher.go @@ -0,0 +1,95 @@ +package finisher + +import ( + "context" + "sync" + + "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/Zeno/internal/pkg/reactor" + "github.com/internetarchive/Zeno/pkg/models" +) + +type finisher struct { + ctx context.Context + cancel context.CancelFunc + inputCh chan *models.Item + errorCh chan *models.Item + wg sync.WaitGroup +} + +var ( + globalFinisher *finisher + once sync.Once + logger *log.FieldedLogger +) + +// Start initializes the global finisher with the given input channel. +// This method can only be called once. +func Start(inputChan, errorChan chan *models.Item) error { + var done bool + + log.Start() + logger = log.NewFieldedLogger(&log.Fields{ + "component": "finisher", + }) + + once.Do(func() { + ctx, cancel := context.WithCancel(context.Background()) + globalFinisher = &finisher{ + ctx: ctx, + cancel: cancel, + inputCh: inputChan, + errorCh: errorChan, + } + logger.Debug("initialized") + globalFinisher.wg.Add(1) + go globalFinisher.run() + logger.Info("started") + done = true + }) + + if !done { + return ErrFinisherAlreadyInitialized + } + + return nil +} + +// Stop stops the global finisher. 
+func Stop() { + if globalFinisher != nil { + logger.Debug("received stop signal") + globalFinisher.cancel() + globalFinisher.wg.Wait() + globalFinisher = nil + once = sync.Once{} + logger.Info("stopped") + } +} + +func (f *finisher) run() { + defer f.wg.Done() + + for { + select { + case <-f.ctx.Done(): + logger.Info("shutting down") + return + case item := <-f.inputCh: + if item == nil { + panic("received nil item") + } + + logger.Debug("received item", "item", item.UUID.String()) + if item.Error != nil { + logger.Error("received item with error", "item", item.UUID.String(), "error", item.Error) + f.errorCh <- item + continue + } + + reactor.MarkAsFinished(item) + + logger.Info("item finished", "item", item.UUID.String()) + } + } +} diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index 7564c334..34724b25 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -11,11 +11,12 @@ import ( ) type postprocessor struct { - wg sync.WaitGroup - ctx context.Context - cancel context.CancelFunc - input chan *models.Item - output chan *models.Item + wg sync.WaitGroup + ctx context.Context + cancel context.CancelFunc + inputCh chan *models.Item + outputCh chan *models.Item + errorCh chan *models.Item } var ( @@ -26,7 +27,7 @@ var ( // This functions starts the preprocessor responsible for preparing // the seeds sent by the reactor for captures -func Start(inputChan, outputChan chan *models.Item) error { +func Start(inputChan, outputChan, errorChan chan *models.Item) error { var done bool log.Start() @@ -39,10 +40,11 @@ func Start(inputChan, outputChan chan *models.Item) error { once.Do(func() { ctx, cancel := context.WithCancel(context.Background()) globalPostprocessor = &postprocessor{ - ctx: ctx, - cancel: cancel, - input: inputChan, - output: outputChan, + ctx: ctx, + cancel: cancel, + inputCh: inputChan, + outputCh: outputChan, + errorCh: errorChan, } 
globalPostprocessor.wg.Add(1) go run() @@ -61,7 +63,7 @@ func Stop() { if globalPostprocessor != nil { globalPostprocessor.cancel() globalPostprocessor.wg.Wait() - close(globalPostprocessor.output) + close(globalPostprocessor.outputCh) logger.Info("stopped") } } @@ -80,7 +82,7 @@ func run() { case <-globalPostprocessor.ctx.Done(): logger.Info("shutting down") return - case item, ok := <-globalPostprocessor.input: + case item, ok := <-globalPostprocessor.inputCh: if ok { guard <- struct{}{} wg.Add(1) @@ -98,5 +100,5 @@ func run() { func postprocess(item *models.Item) { // TODO - globalPostprocessor.output <- item + globalPostprocessor.outputCh <- item } diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index 5dfca067..44afbef1 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -14,11 +14,12 @@ import ( ) type preprocessor struct { - wg sync.WaitGroup - ctx context.Context - cancel context.CancelFunc - input chan *models.Item - output chan *models.Item + wg sync.WaitGroup + ctx context.Context + cancel context.CancelFunc + inputCh chan *models.Item + outputCh chan *models.Item + errorCh chan *models.Item } var ( @@ -29,7 +30,7 @@ var ( // This functions starts the preprocessor responsible for preparing // the seeds sent by the reactor for captures -func Start(inputChan, outputChan chan *models.Item) error { +func Start(inputChan, outputChan, errorChan chan *models.Item) error { var done bool log.Start() @@ -42,10 +43,11 @@ func Start(inputChan, outputChan chan *models.Item) error { once.Do(func() { ctx, cancel := context.WithCancel(context.Background()) globalPreprocessor = &preprocessor{ - ctx: ctx, - cancel: cancel, - input: inputChan, - output: outputChan, + ctx: ctx, + cancel: cancel, + inputCh: inputChan, + outputCh: outputChan, + errorCh: errorChan, } globalPreprocessor.wg.Add(1) go run() @@ -64,7 +66,7 @@ func Stop() { if globalPreprocessor != nil { 
globalPreprocessor.cancel() globalPreprocessor.wg.Wait() - close(globalPreprocessor.output) + close(globalPreprocessor.outputCh) logger.Info("stopped") } } @@ -83,7 +85,7 @@ func run() { case <-globalPreprocessor.ctx.Done(): logger.Info("shutting down") return - case item, ok := <-globalPreprocessor.input: + case item, ok := <-globalPreprocessor.inputCh: if ok { guard <- struct{}{} wg.Add(1) @@ -198,5 +200,5 @@ func preprocess(item *models.Item) { } // Final step, send the preprocessored item to the output chan of the preprocessor - globalPreprocessor.output <- item + globalPreprocessor.outputCh <- item } diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index 53ea6a15..bdf49f59 100644 --- a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -46,7 +46,7 @@ func Start(maxTokens int, outputChan chan *models.Item) error { input: make(chan *models.Item, maxTokens), output: outputChan, } - logger.Info("initialized") + logger.Debug("initialized") globalReactor.wg.Add(1) go globalReactor.run() logger.Info("started") @@ -63,11 +63,13 @@ func Start(maxTokens int, outputChan chan *models.Item) error { // Stop stops the global reactor and waits for all goroutines to finish. 
func Stop() { if globalReactor != nil { + logger.Debug("received stop signal") globalReactor.cancel() globalReactor.wg.Wait() close(globalReactor.input) close(globalReactor.tokenPool) once = sync.Once{} + globalReactor = nil logger.Info("stopped") } } diff --git a/pkg/models/seed.go b/pkg/models/seed.go index 57f4421f..d5cdf3d9 100644 --- a/pkg/models/seed.go +++ b/pkg/models/seed.go @@ -12,6 +12,7 @@ type Item struct { Source ItemSource // Source is the source of the item in the pipeline ChildsCaptured bool // ChildsCaptured is the flag to indicate if the child URLs of the item have been captured Childs []*URL // Childs is the list of URLs that have been discovered via the item's URL + Error error // Error message of the seed } // ItemState qualifies the state of a item in the pipeline From 7e599a2ab0f84b2e230b6db6ce11d5b082b8e2b2 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 20:01:12 +0100 Subject: [PATCH 037/295] finisher: add item error handling mock --- internal/pkg/finisher/finisher.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/internal/pkg/finisher/finisher.go b/internal/pkg/finisher/finisher.go index 3f98505a..4376a988 100644 --- a/internal/pkg/finisher/finisher.go +++ b/internal/pkg/finisher/finisher.go @@ -90,6 +90,16 @@ func (f *finisher) run() { reactor.MarkAsFinished(item) logger.Info("item finished", "item", item.UUID.String()) + case item := <-f.errorCh: + if item == nil { + panic("received nil item") + } + + logger.Info("received item with error", "item", item.UUID.String(), "error", item.Error) + + reactor.MarkAsFinished(item) + + logger.Info("item with error finished", "item", item.UUID.String()) } } } From d7437ee36d03477191f7d7e2f2309400fe006bee Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 20:04:49 +0100 Subject: [PATCH 038/295] main: add finisher to main and pass seedErrorChan to other steps --- main.go | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff 
--git a/main.go b/main.go index d3ff11ea..09cdfa3a 100644 --- a/main.go +++ b/main.go @@ -15,6 +15,7 @@ import ( "github.com/internetarchive/Zeno/cmd" "github.com/internetarchive/Zeno/internal/pkg/archiver" "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/finisher" "github.com/internetarchive/Zeno/internal/pkg/log" "github.com/internetarchive/Zeno/internal/pkg/postprocessor" "github.com/internetarchive/Zeno/internal/pkg/preprocessor" @@ -38,6 +39,8 @@ func main() { os.Exit(1) } + seedErrorChan := make(chan *models.Item) + // Start the reactor that will receive reactorOutputChan := make(chan *models.Item) err := reactor.Start(config.Get().WorkersCount, reactorOutputChan) @@ -48,7 +51,7 @@ func main() { defer reactor.Stop() preprocessorOutputChan := make(chan *models.Item) - err = preprocessor.Start(reactorOutputChan, preprocessorOutputChan) + err = preprocessor.Start(reactorOutputChan, preprocessorOutputChan, seedErrorChan) if err != nil { logger.Error("error starting preprocessor", "err", err.Error()) return @@ -56,7 +59,7 @@ func main() { defer preprocessor.Stop() archiverOutputChan := make(chan *models.Item) - err = archiver.Start(preprocessorOutputChan, archiverOutputChan) + err = archiver.Start(preprocessorOutputChan, archiverOutputChan, seedErrorChan) if err != nil { logger.Error("error starting archiver", "err", err.Error()) return @@ -64,10 +67,16 @@ func main() { defer archiver.Stop() postprocessorOutputChan := make(chan *models.Item) - err = postprocessor.Start(archiverOutputChan, postprocessorOutputChan) + err = postprocessor.Start(archiverOutputChan, postprocessorOutputChan, seedErrorChan) if err != nil { logger.Error("error starting postprocessor", "err", err.Error()) return } defer postprocessor.Stop() + + err = finisher.Start(postprocessorOutputChan, seedErrorChan) + if err != nil { + logger.Error("error starting finisher", "err", err.Error()) + return + } } From 
d093a51537e071e1ad7cdde186cf5f7a842c2e91 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 20:18:27 +0100 Subject: [PATCH 039/295] add test main --- internal/pkg/preprocessor/preprocessor.go | 1 + internal/pkg/reactor/reactor.go | 1 + main.go | 41 +++++++++++++++++++++-- 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index 44afbef1..783c3ca1 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -87,6 +87,7 @@ func run() { return case item, ok := <-globalPreprocessor.inputCh: if ok { + logger.Info("received item", "item", item.UUID.String()) guard <- struct{}{} wg.Add(1) stats.PreprocessorRoutinesIncr() diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index bdf49f59..204ed3a4 100644 --- a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -98,6 +98,7 @@ func ReceiveFeedback(item *models.Item) error { // ReceiveInsert sends an item to the input channel consuming a token. 
// It is the responsibility of the sender to set either ItemSourceQueue or ItemSourceHQ, if not set seed will get forced ItemSourceInsert func ReceiveInsert(item *models.Item) error { + logger.Info("received seed", "seed", item.UUID.String()) if globalReactor == nil { return ErrReactorNotInitialized } diff --git a/main.go b/main.go index 09cdfa3a..a360ef61 100644 --- a/main.go +++ b/main.go @@ -11,15 +11,17 @@ package main import ( "fmt" "os" + "time" + "github.com/google/uuid" "github.com/internetarchive/Zeno/cmd" "github.com/internetarchive/Zeno/internal/pkg/archiver" - "github.com/internetarchive/Zeno/internal/pkg/config" "github.com/internetarchive/Zeno/internal/pkg/finisher" "github.com/internetarchive/Zeno/internal/pkg/log" "github.com/internetarchive/Zeno/internal/pkg/postprocessor" "github.com/internetarchive/Zeno/internal/pkg/preprocessor" "github.com/internetarchive/Zeno/internal/pkg/reactor" + "github.com/internetarchive/Zeno/internal/pkg/stats" "github.com/internetarchive/Zeno/pkg/models" ) @@ -43,13 +45,36 @@ func main() { // Start the reactor that will receive reactorOutputChan := make(chan *models.Item) - err := reactor.Start(config.Get().WorkersCount, reactorOutputChan) + // err := reactor.Start(config.Get().WorkersCount, reactorOutputChan) + err := reactor.Start(5, reactorOutputChan) if err != nil { logger.Error("error starting reactor", "err", err.Error()) return } defer reactor.Stop() + // Create mock seeds + seeds := 5 + mockItems := []*models.Item{} + for i := 0; i <= seeds; i++ { + uuid := uuid.New() + mockItems = append(mockItems, &models.Item{ + UUID: &uuid, + URL: &models.URL{Raw: fmt.Sprintf("https://www.deezer.fr/%d", i)}, + Status: models.ItemFresh, + Source: models.ItemSourceHQ, + }) + } + + // Queue mock seeds to the source channel + for _, seed := range mockItems { + err := reactor.ReceiveInsert(seed) + if err != nil { + logger.Error("Error queuing seed to source channel", "error", err.Error()) + return + } + } + 
preprocessorOutputChan := make(chan *models.Item) err = preprocessor.Start(reactorOutputChan, preprocessorOutputChan, seedErrorChan) if err != nil { @@ -79,4 +104,16 @@ func main() { logger.Error("error starting finisher", "err", err.Error()) return } + + for { + time.Sleep(1 * time.Second) + if len(reactor.GetStateTable()) == 0 { + return + } + fmt.Println("URLsCrawledGet" + string(stats.URLsCrawledGet())) + fmt.Println("SeedsFinishedGet" + string(stats.SeedsFinishedGet())) + fmt.Println("PreprocessorRoutinesGet" + string(stats.PreprocessorRoutinesGet())) + fmt.Println("ArchiverRoutinesGet" + string(stats.ArchiverRoutinesGet())) + fmt.Println("PostprocessorRoutinesGet" + string(stats.PostprocessorRoutinesGet())) + } } From d4b466f1d7dd272b3fe6a43682f454bd661b9153 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 19 Nov 2024 20:26:40 +0100 Subject: [PATCH 040/295] fix: WARC writers err chans --- internal/pkg/archiver/warc.go | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/internal/pkg/archiver/warc.go b/internal/pkg/archiver/warc.go index 039cd41a..59f9bc17 100644 --- a/internal/pkg/archiver/warc.go +++ b/internal/pkg/archiver/warc.go @@ -52,28 +52,28 @@ func startWARCWriter() { logger.Error("unable to init proxied WARC HTTP client", "err", err.Error(), "func", "archiver.startWARCWriter") os.Exit(1) } + + go func() { + for err := range globalArchiver.ClientWithProxy.ErrChan { + logger.Error("WARC writer error", "err", err.Err.Error(), "func", err.Func) + } + }() } // Even if a proxied client has been set, we want to create an non-proxied one // if DomainsBypassProxy is used. 
The domains specified in this slice won't go // through the proxied client, but through a "normal" client if config.Get().Proxy == "" || len(config.Get().DomainsBypassProxy) > 0 { - globalArchiver.ClientWithProxy, err = warc.NewWARCWritingHTTPClient(WARCSettings) + globalArchiver.Client, err = warc.NewWARCWritingHTTPClient(WARCSettings) if err != nil { logger.Error("unable to init WARC HTTP client", "err", err.Error(), "func", "archiver.startWARCWriter") os.Exit(1) } - } - - go func() { - for err := range globalArchiver.Client.ErrChan { - logger.Error("WARC writer error", "err", err.Err.Error(), "func", err.Func) - } - }() - go func() { - for err := range globalArchiver.ClientWithProxy.ErrChan { - logger.Error("WARC writer error", "err", err.Err.Error(), "func", err.Func) - } - }() + go func() { + for err := range globalArchiver.Client.ErrChan { + logger.Error("WARC writer error", "err", err.Err.Error(), "func", err.Func) + } + }() + } } From fe395bf175e383dca19faf57dfabdc35d8b64487 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 19 Nov 2024 20:36:36 +0100 Subject: [PATCH 041/295] fix: various things --- internal/pkg/archiver/archiver.go | 4 ++-- internal/pkg/config/config.go | 3 --- main.go | 18 +++++++++--------- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index a1090402..92236022 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -113,7 +113,7 @@ func archive(item *models.Item) { var ( URLsToCapture []*models.URL guard = make(chan struct{}, config.Get().MaxConcurrentAssets) - wg *sync.WaitGroup + wg sync.WaitGroup ) // Determines the URLs that need to be captured, if the item's status is fresh we need @@ -136,7 +136,7 @@ func archive(item *models.Item) { resp *http.Response ) - if config.Get().Proxy == "" { + if config.Get().Proxy != "" { resp, err = globalArchiver.ClientWithProxy.Do(URL.GetRequest()) } else { resp, err = 
globalArchiver.Client.Do(URL.GetRequest()) diff --git a/internal/pkg/config/config.go b/internal/pkg/config/config.go index 573fb07e..a402b97f 100644 --- a/internal/pkg/config/config.go +++ b/internal/pkg/config/config.go @@ -198,9 +198,6 @@ func GenerateCrawlConfig() error { config.JobPath = path.Join("jobs", config.Job) - // TODO - // Crawl.Workers = NewPool(uint(config.WorkersCount), time.Second*60, c) - config.UseSeencheck = !config.DisableSeencheck // Defaults --max-crawl-time-limit to 10% more than --crawl-time-limit diff --git a/main.go b/main.go index a360ef61..842c1e9a 100644 --- a/main.go +++ b/main.go @@ -66,15 +66,6 @@ func main() { }) } - // Queue mock seeds to the source channel - for _, seed := range mockItems { - err := reactor.ReceiveInsert(seed) - if err != nil { - logger.Error("Error queuing seed to source channel", "error", err.Error()) - return - } - } - preprocessorOutputChan := make(chan *models.Item) err = preprocessor.Start(reactorOutputChan, preprocessorOutputChan, seedErrorChan) if err != nil { @@ -105,6 +96,15 @@ func main() { return } + // Queue mock seeds to the source channel + for _, seed := range mockItems { + err := reactor.ReceiveInsert(seed) + if err != nil { + logger.Error("Error queuing seed to source channel", "error", err.Error()) + return + } + } + for { time.Sleep(1 * time.Second) if len(reactor.GetStateTable()) == 0 { From c2211dac8c606bdaaeb9cbf28bb47b9b915cd6f4 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 19 Nov 2024 20:39:29 +0100 Subject: [PATCH 042/295] add: logging --- internal/pkg/postprocessor/postprocessor.go | 3 ++- internal/pkg/preprocessor/preprocessor.go | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index 34724b25..a7666e09 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -84,6 +84,7 @@ func run() { return case item, ok := 
<-globalPostprocessor.inputCh: if ok { + logger.Info("received item", "item", item.UUID.String()) guard <- struct{}{} wg.Add(1) stats.PostprocessorRoutinesIncr() @@ -92,6 +93,7 @@ func run() { defer func() { <-guard }() defer stats.PostprocessorRoutinesDecr() postprocess(item) + globalPostprocessor.outputCh <- item }() } } @@ -100,5 +102,4 @@ func run() { func postprocess(item *models.Item) { // TODO - globalPostprocessor.outputCh <- item } diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index 783c3ca1..8b0e074e 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -96,6 +96,7 @@ func run() { defer func() { <-guard }() defer stats.PreprocessorRoutinesDecr() preprocess(item) + globalPreprocessor.outputCh <- item }() } } @@ -199,7 +200,4 @@ func preprocess(item *models.Item) { item.Childs[i].SetRequest(req) } } - - // Final step, send the preprocessored item to the output chan of the preprocessor - globalPreprocessor.outputCh <- item } From d883e7bfc4f8de251faad6942a7c1170b7801e82 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 20:50:57 +0100 Subject: [PATCH 043/295] working pipeline with test main --- internal/pkg/archiver/archiver.go | 1 + main.go | 23 +++++++++-------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index 92236022..c8d2c447 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -93,6 +93,7 @@ func run() { return case item, ok := <-globalArchiver.inputCh: if ok { + logger.Info("received item", "item", item.UUID.String()) guard <- struct{}{} wg.Add(1) stats.ArchiverRoutinesIncr() diff --git a/main.go b/main.go index 842c1e9a..ff313626 100644 --- a/main.go +++ b/main.go @@ -21,7 +21,6 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/postprocessor" 
"github.com/internetarchive/Zeno/internal/pkg/preprocessor" "github.com/internetarchive/Zeno/internal/pkg/reactor" - "github.com/internetarchive/Zeno/internal/pkg/stats" "github.com/internetarchive/Zeno/pkg/models" ) @@ -51,19 +50,18 @@ func main() { logger.Error("error starting reactor", "err", err.Error()) return } - defer reactor.Stop() // Create mock seeds seeds := 5 - mockItems := []*models.Item{} - for i := 0; i <= seeds; i++ { + mockItems := make([]*models.Item, 5) + for i := 0; i < seeds; i++ { uuid := uuid.New() - mockItems = append(mockItems, &models.Item{ + mockItems[i] = &models.Item{ UUID: &uuid, URL: &models.URL{Raw: fmt.Sprintf("https://www.deezer.fr/%d", i)}, Status: models.ItemFresh, Source: models.ItemSourceHQ, - }) + } } preprocessorOutputChan := make(chan *models.Item) @@ -72,7 +70,6 @@ func main() { logger.Error("error starting preprocessor", "err", err.Error()) return } - defer preprocessor.Stop() archiverOutputChan := make(chan *models.Item) err = archiver.Start(preprocessorOutputChan, archiverOutputChan, seedErrorChan) @@ -80,7 +77,6 @@ func main() { logger.Error("error starting archiver", "err", err.Error()) return } - defer archiver.Stop() postprocessorOutputChan := make(chan *models.Item) err = postprocessor.Start(archiverOutputChan, postprocessorOutputChan, seedErrorChan) @@ -88,7 +84,6 @@ func main() { logger.Error("error starting postprocessor", "err", err.Error()) return } - defer postprocessor.Stop() err = finisher.Start(postprocessorOutputChan, seedErrorChan) if err != nil { @@ -108,12 +103,12 @@ func main() { for { time.Sleep(1 * time.Second) if len(reactor.GetStateTable()) == 0 { + finisher.Stop() + postprocessor.Stop() + archiver.Stop() + preprocessor.Stop() + reactor.Stop() return } - fmt.Println("URLsCrawledGet" + string(stats.URLsCrawledGet())) - fmt.Println("SeedsFinishedGet" + string(stats.SeedsFinishedGet())) - fmt.Println("PreprocessorRoutinesGet" + string(stats.PreprocessorRoutinesGet())) - 
fmt.Println("ArchiverRoutinesGet" + string(stats.ArchiverRoutinesGet())) - fmt.Println("PostprocessorRoutinesGet" + string(stats.PostprocessorRoutinesGet())) } } From f3f20ac2f149d2c90c278fc7b7b63ec69ff2e948 Mon Sep 17 00:00:00 2001 From: Will Howes Date: Tue, 19 Nov 2024 22:50:31 +0100 Subject: [PATCH 044/295] rename validateURL to normalizeURL --- internal/pkg/preprocessor/preprocessor.go | 4 ++-- internal/pkg/preprocessor/url.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index 8b0e074e..c63a69d8 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -118,7 +118,7 @@ func preprocess(item *models.Item) { URLType = "seed" // Validate the item's URL itself - err = validateURL(item.URL, nil) + err = normalizeURL(item.URL, nil) if err != nil { logger.Warn("unable to validate URL", "url", item.URL.Raw, "err", err.Error(), "func", "preprocessor.preprocessor") return @@ -132,7 +132,7 @@ func preprocess(item *models.Item) { // Validate the URLs of the child items for i := 0; i < len(item.Childs); { - err = validateURL(item.Childs[i], item.URL) + err = normalizeURL(item.Childs[i], item.URL) if err != nil { // If we can't validate an URL, we remove it from the list of childs logger.Warn("unable to validate URL", "url", item.Childs[i].Raw, "err", err.Error(), "func", "preprocessor.preprocessor") diff --git a/internal/pkg/preprocessor/url.go b/internal/pkg/preprocessor/url.go index 29c68f9a..84be7696 100644 --- a/internal/pkg/preprocessor/url.go +++ b/internal/pkg/preprocessor/url.go @@ -2,7 +2,7 @@ package preprocessor import "github.com/internetarchive/Zeno/pkg/models" -func validateURL(URL *models.URL, parentURL *models.URL) (err error) { +func normalizeURL(URL *models.URL, parentURL *models.URL) (err error) { // Validate the URL, REMOVE FRAGMENTS, try to fix it, make it absolute if needed, etc. 
return URL.Parse() } From a6883773c33e06c1e9fe1b4bf7ee689ed444441d Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 19 Nov 2024 23:26:09 +0100 Subject: [PATCH 045/295] add: HQ consumer --- .old/internal/pkg/crawl/config.go | 3 - cmd/get.go | 4 +- cmd/get_hq.go | 4 +- internal/pkg/config/config.go | 9 +- internal/pkg/source/hq/consumer.go | 139 ++++++++++++++++++++ internal/pkg/source/hq/error.go | 8 ++ internal/pkg/source/hq/hq.go | 184 +++++++++------------------ internal/pkg/source/hq/seencheck.go | 2 +- internal/pkg/source/hq/utils.go | 12 ++ internal/pkg/source/hq/utils_test.go | 26 ++++ internal/pkg/source/hq/websocket.go | 4 +- pkg/models/url.go | 8 +- 12 files changed, 264 insertions(+), 139 deletions(-) create mode 100644 internal/pkg/source/hq/consumer.go create mode 100644 internal/pkg/source/hq/error.go create mode 100644 internal/pkg/source/hq/utils.go create mode 100644 internal/pkg/source/hq/utils_test.go diff --git a/.old/internal/pkg/crawl/config.go b/.old/internal/pkg/crawl/config.go index a2bae436..06f80fc6 100644 --- a/.old/internal/pkg/crawl/config.go +++ b/.old/internal/pkg/crawl/config.go @@ -116,9 +116,7 @@ type Crawl struct { HQStrategy string HQBatchConcurrency int HQBatchSize int - HQContinuousPull bool HQClient *gocrawlhq.Client - HQConsumerState string HQFinishedChannel chan *queue.Item HQProducerChannel chan *queue.Item HQChannelsWg *sync.WaitGroup @@ -321,7 +319,6 @@ func GenerateCrawlConfig(config *config.Config) (*Crawl, error) { c.HQStrategy = config.HQStrategy c.HQBatchSize = int(config.HQBatchSize) c.HQBatchConcurrency = config.HQBatchConcurrency - c.HQContinuousPull = config.HQContinuousPull c.HQRateLimitingSendBack = config.HQRateLimitSendBack // Handover mechanism diff --git a/cmd/get.go b/cmd/get.go index c775ca4a..5f17726e 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -28,7 +28,7 @@ func getCMDsFlags(getCmd *cobra.Command) { getCmd.PersistentFlags().String("job", "", "Job name to use, will determine the path for the 
persistent queue, seencheck database, and WARC files.") getCmd.PersistentFlags().IntP("workers", "w", 1, "Number of concurrent workers to run.") getCmd.PersistentFlags().Int("max-concurrent-assets", 8, "Max number of concurrent assets to fetch PER worker. E.g. if you have 100 workers and this setting at 8, Zeno could do up to 800 concurrent requests at any time.") - getCmd.PersistentFlags().Uint8("max-hops", 0, "Maximum number of hops to execute.") + getCmd.PersistentFlags().Int("max-hops", 0, "Maximum number of hops to execute.") getCmd.PersistentFlags().String("cookies", "", "File containing cookies that will be used for requests.") getCmd.PersistentFlags().Bool("keep-cookies", false, "Keep a global cookie jar") getCmd.PersistentFlags().Bool("headless", false, "Use headless browsers instead of standard GET requests.") @@ -95,7 +95,7 @@ func getCMDsFlags(getCmd *cobra.Command) { // Aliases shouldn't be used as proper flags nor declared in the config struct // Aliases should be marked as deprecated to inform the user base // Aliases values should be copied to the proper flag in the config/config.go:handleFlagsAliases() function - getCmd.PersistentFlags().Uint8("hops", 0, "Maximum number of hops to execute.") + getCmd.PersistentFlags().Int("hops", 0, "Maximum number of hops to execute.") getCmd.PersistentFlags().MarkDeprecated("hops", "use --max-hops instead") getCmd.PersistentFlags().MarkHidden("hops") diff --git a/cmd/get_hq.go b/cmd/get_hq.go index b913ee35..d179853b 100644 --- a/cmd/get_hq.go +++ b/cmd/get_hq.go @@ -32,8 +32,8 @@ func getHQCmdFlags(getHQCmd *cobra.Command) { getHQCmd.PersistentFlags().String("hq-project", "", "Crawl HQ project.") getHQCmd.PersistentFlags().Bool("hq-continuous-pull", false, "If turned on, the crawler will pull URLs from Crawl HQ continuously.") getHQCmd.PersistentFlags().String("hq-strategy", "lifo", "Crawl HQ feeding strategy.") - getHQCmd.PersistentFlags().Int64("hq-batch-size", 0, "Crawl HQ feeding batch size.") - 
getHQCmd.PersistentFlags().Int64("hq-batch-concurrency", 1, "Number of concurrent requests to do to get the --hq-batch-size, if batch size is 300 and batch-concurrency is 10, 30 requests will be done concurrently.") + getHQCmd.PersistentFlags().Int("hq-batch-size", 0, "Crawl HQ feeding batch size.") + getHQCmd.PersistentFlags().Int("hq-batch-concurrency", 1, "Number of concurrent requests to do to get the --hq-batch-size, if batch size is 300 and batch-concurrency is 10, 30 requests will be done concurrently.") getHQCmd.PersistentFlags().Bool("hq-rate-limiting-send-back", false, "If turned on, the crawler will send back URLs that hit a rate limit to crawl HQ.") getHQCmd.MarkPersistentFlagRequired("hq-address") diff --git a/internal/pkg/config/config.go b/internal/pkg/config/config.go index a402b97f..464c2f13 100644 --- a/internal/pkg/config/config.go +++ b/internal/pkg/config/config.go @@ -47,7 +47,7 @@ type Config struct { HQSecret string `mapstructure:"hq-secret"` HQProject string `mapstructure:"hq-project"` HQStrategy string `mapstructure:"hq-strategy"` - HQBatchSize int64 `mapstructure:"hq-batch-size"` + HQBatchSize int `mapstructure:"hq-batch-size"` HQBatchConcurrency int `mapstructure:"hq-batch-concurrency"` LogFileOutputDir string `mapstructure:"log-file-output-dir"` ElasticSearchUsername string `mapstructure:"es-user"` @@ -61,9 +61,9 @@ type Config struct { ElasticSearchURLs []string `mapstructure:"es-url"` WorkersCount int `mapstructure:"workers"` MaxConcurrentAssets int `mapstructure:"max-concurrent-assets"` - MaxHops uint8 `mapstructure:"max-hops"` - MaxRedirect uint8 `mapstructure:"max-redirect"` - MaxRetry uint8 `mapstructure:"max-retry"` + MaxHops int `mapstructure:"max-hops"` + MaxRedirect int `mapstructure:"max-redirect"` + MaxRetry int `mapstructure:"max-retry"` HTTPTimeout int `mapstructure:"http-timeout"` CrawlTimeLimit int `mapstructure:"crawl-time-limit"` CrawlMaxTimeLimit int `mapstructure:"crawl-max-time-limit"` @@ -81,7 +81,6 @@ type Config 
struct { CertValidation bool `mapstructure:"cert-validation"` DisableAssetsCapture bool `mapstructure:"disable-assets-capture"` HQ bool // Special field to check if HQ is enabled depending on the command called - HQContinuousPull bool `mapstructure:"hq-continuous-pull"` HQRateLimitSendBack bool `mapstructure:"hq-rate-limiting-send-back"` NoStdoutLogging bool `mapstructure:"no-stdout-log"` NoBatchWriteWAL bool `mapstructure:"ultrasafe-queue"` diff --git a/internal/pkg/source/hq/consumer.go b/internal/pkg/source/hq/consumer.go new file mode 100644 index 00000000..9c0a375f --- /dev/null +++ b/internal/pkg/source/hq/consumer.go @@ -0,0 +1,139 @@ +package hq + +import ( + "sync" + + "github.com/google/uuid" + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/pkg/models" + "github.com/internetarchive/gocrawlhq" +) + +func consumer() { + var wg sync.WaitGroup // WaitGroup to track batch-sending goroutines + + for { + select { + case <-globalHQ.consumerStopChan: + // Received signal to stop + // Wait for all batch-sending goroutines to finish + wg.Wait() + return + default: + // This is purposely evaluated every time, + // because the value of workers might change + // during the crawl in the future (to be implemented) + var HQBatchSize = config.Get().WorkersCount + + // If a specific HQ batch size is set, use it + if config.Get().HQBatchSize != 0 { + HQBatchSize = config.Get().HQBatchSize + } + + // Get a batch of URLs from crawl HQ + URLs, err := getURLs(HQBatchSize) + if err != nil { + logger.Error("error getting new URLs from crawl HQ", "err", err.Error(), "func", "hq.Consumer") + continue + } + + // Channel to receive pre-fetch signal + prefetchSignal := make(chan struct{}, 1) + + // Increment the WaitGroup counter + wg.Add(1) + + // Send the URLs to the reactor in a goroutine + go func(urls []gocrawlhq.URL) { + defer wg.Done() // Decrement the WaitGroup counter when done + + totalURLs := len(urls) + for i, URL := range urls { + 
UUID := uuid.New() + globalHQ.outputChan <- &models.Item{ + UUID: &UUID, + URL: &models.URL{ + Raw: URL.Value, + Hop: pathToHop(URL.Path), + }, + Status: models.ItemFresh, + Source: models.ItemSourceHQ, + } + + // When one-third of the URLs are left, send a pre-fetch signal + if i == totalURLs-totalURLs/3 { + // Send pre-fetch signal to Consumer + select { + case prefetchSignal <- struct{}{}: + default: + // Signal already sent; do nothing + } + } + + // Check if stop signal is received to exit early + select { + case <-globalHQ.consumerStopChan: + // Stop signal received, exit the goroutine + return + default: + // Continue sending URLs + } + } + }(URLs) + + // Wait for pre-fetch signal or stop signal + select { + case <-prefetchSignal: + // Received pre-fetch signal; continue to fetch next batch + continue + case <-globalHQ.consumerStopChan: + // Received signal to stop + // Wait for all batch-sending goroutines to finish + wg.Wait() + return + } + } + } +} + +func getURLs(HQBatchSize int) ([]gocrawlhq.URL, error) { + if config.Get().HQBatchConcurrency == 1 { + return globalHQ.client.Get(HQBatchSize, config.Get().HQStrategy) + } + + var mu sync.Mutex + var wg sync.WaitGroup + batchSize := HQBatchSize / config.Get().HQBatchConcurrency + URLsChan := make(chan []gocrawlhq.URL, config.Get().HQBatchConcurrency) + var URLs []gocrawlhq.URL + + // Start goroutines to get URLs from crawl HQ, each will request + // HQBatchSize / HQConcurrentBatch URLs + for i := 0; i < config.Get().HQBatchConcurrency; i++ { + wg.Add(1) + go func() { + defer wg.Done() + URLs, err := globalHQ.client.Get(batchSize, config.Get().HQStrategy) + if err != nil { + logger.Error("error getting new URLs from crawl HQ", "err", err.Error(), "func", "hq.getURLs") + return + } + URLsChan <- URLs + }() + } + + // Wait for all goroutines to finish + go func() { + wg.Wait() + close(URLsChan) + }() + + // Collect all URLs from the channels + for URLsFromChan := range URLsChan { + mu.Lock() + URLs = 
append(URLs, URLsFromChan...) + mu.Unlock() + } + + return URLs, nil +} diff --git a/internal/pkg/source/hq/error.go b/internal/pkg/source/hq/error.go new file mode 100644 index 00000000..82f070e6 --- /dev/null +++ b/internal/pkg/source/hq/error.go @@ -0,0 +1,8 @@ +package hq + +import "errors" + +var ( + // ErrHQAlreadyInitialized is the error returned when the postprocessor is already initialized + ErrHQAlreadyInitialized = errors.New("hq client already initialized") +) diff --git a/internal/pkg/source/hq/hq.go b/internal/pkg/source/hq/hq.go index 2c20af97..545116cd 100644 --- a/internal/pkg/source/hq/hq.go +++ b/internal/pkg/source/hq/hq.go @@ -1,13 +1,74 @@ package hq import ( + "context" + "os" + "sync" + + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/Zeno/internal/pkg/stats" + "github.com/internetarchive/Zeno/pkg/models" "github.com/internetarchive/gocrawlhq" ) +type hq struct { + wg sync.WaitGroup + ctx context.Context + cancel context.CancelFunc + inputChan chan *models.Item + outputChan chan *models.Item + client *gocrawlhq.Client + consumerStopChan chan struct{} +} + var ( - HQClient *gocrawlhq.Client + globalHQ *hq + once sync.Once + logger *log.FieldedLogger ) +func Start(outputChan chan *models.Item) error { + var done bool + + log.Start() + logger = log.NewFieldedLogger(&log.Fields{ + "component": "hq", + }) + + stats.Init() + + once.Do(func() { + var err error + + globalHQ.client, err = gocrawlhq.Init(config.Get().HQKey, config.Get().HQSecret, config.Get().HQProject, config.Get().HQAddress, "") + if err != nil { + logger.Error("error initializing crawl HQ client", "err", err.Error(), "func", "hq.Start") + os.Exit(1) + } + + globalHQ.wg.Add(2) + go consumer() + // go Finisher() + done = true + }) + + if !done { + return ErrHQAlreadyInitialized + } + + return nil +} + +func Stop() { + if globalHQ != nil { + globalHQ.cancel() + globalHQ.wg.Wait() + 
close(globalHQ.outputChan) + logger.Info("stopped") + } +} + // func HQProducer() { // defer c.HQChannelsWg.Done() @@ -105,127 +166,6 @@ var ( // terminateProducer <- true // } -// func HQConsumer() { -// for { -// c.HQConsumerState = "running" - -// // This is on purpose evaluated every time, -// // because the value of workers will maybe change -// // during the crawl in the future (to be implemented) -// var HQBatchSize = int(c.Workers.Count) - -// if c.Finished.Get() { -// c.HQConsumerState = "finished" -// c.Log.Error("crawl finished, stopping HQ consumer") -// break -// } - -// // If HQContinuousPull is set to true, we will pull URLs from HQ continuously, -// // otherwise we will only pull URLs when needed (and when the crawl is not paused) -// for (c.Queue.GetStats().TotalElements > HQBatchSize && !c.HQContinuousPull) || c.Paused.Get() || c.Queue.HandoverOpen.Get() { -// c.HQConsumerState = "waiting" -// c.Log.Info("HQ producer waiting", "paused", c.Paused.Get(), "handoverOpen", c.Queue.HandoverOpen.Get(), "queueSize", c.Queue.GetStats().TotalElements) -// time.Sleep(time.Millisecond * 50) -// continue -// } - -// // If a specific HQ batch size is set, use it -// if c.HQBatchSize != 0 { -// HQBatchSize = c.HQBatchSize -// } - -// // get batch from crawl HQ -// c.HQConsumerState = "waitingOnFeed" -// var URLs []gocrawlhq.URL -// var err error -// if c.HQBatchConcurrency == 1 { -// URLs, err = c.HQClient.Get(HQBatchSize, c.HQStrategy) -// if err != nil { -// // c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ -// // "batchSize": HQBatchSize, -// // "err": err, -// // })).Debug("error getting new URLs from crawl HQ") -// continue -// } -// } else { -// var mu sync.Mutex -// var wg sync.WaitGroup -// batchSize := HQBatchSize / c.HQBatchConcurrency -// URLsChan := make(chan []gocrawlhq.URL, c.HQBatchConcurrency) - -// // Start goroutines to get URLs from crawl HQ, each will request -// // HQBatchSize / HQConcurrentBatch URLs -// for i := 0; i < 
c.HQBatchConcurrency; i++ { -// wg.Add(1) -// go func() { -// defer wg.Done() -// URLs, err := c.HQClient.Get(batchSize, c.HQStrategy) -// if err != nil { -// // c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ -// // "batchSize": batchSize, -// // "err": err, -// // })).Debug("error getting new URLs from crawl HQ") -// return -// } -// URLsChan <- URLs -// }() -// } - -// // Wait for all goroutines to finish -// go func() { -// wg.Wait() -// close(URLsChan) -// }() - -// // Collect all URLs from the channels -// for URLsFromChan := range URLsChan { -// mu.Lock() -// URLs = append(URLs, URLsFromChan...) -// mu.Unlock() -// } -// } -// c.HQConsumerState = "feedCompleted" - -// // send all URLs received in the batch to the queue -// var items = make([]*queue.Item, 0, len(URLs)) -// if len(URLs) > 0 { -// for _, URL := range URLs { -// c.HQConsumerState = "urlParse" -// newURL, err := url.Parse(URL.Value) -// if err != nil { -// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ -// "url": URL.Value, -// "batchSize": HQBatchSize, -// "err": err, -// })).Error("unable to parse URL received from crawl HQ, discarding") -// continue -// } - -// c.HQConsumerState = "newItem" -// newItem, err := queue.NewItem(newURL, nil, "seed", uint64(strings.Count(URL.Path, "L")), URL.ID, false) -// if err != nil { -// c.Log.WithFields(c.genLogFields(err, newURL, map[string]interface{}{ -// "url": URL.Value, -// "batchSize": HQBatchSize, -// "err": err, -// })).Error("unable to create new item from URL received from crawl HQ, discarding") -// continue -// } - -// c.HQConsumerState = "append" -// items = append(items, newItem) -// } -// } - -// c.HQConsumerState = "enqueue" -// err = c.Queue.BatchEnqueue(items...) 
-// if err != nil { -// c.Log.Error("unable to enqueue URL batch received from crawl HQ, discarding", "error", err) -// continue -// } -// } -// } - // func HQFinisher() { // defer c.HQChannelsWg.Done() diff --git a/internal/pkg/source/hq/seencheck.go b/internal/pkg/source/hq/seencheck.go index bafd0554..8d857f03 100644 --- a/internal/pkg/source/hq/seencheck.go +++ b/internal/pkg/source/hq/seencheck.go @@ -19,7 +19,7 @@ func SeencheckURLs(URLsType string, URLs ...*models.URL) (seencheckedURLs []*mod }) } - outputURLs, err := HQClient.Seencheck(discoveredURLs) + outputURLs, err := globalHQ.client.Seencheck(discoveredURLs) if err != nil { slog.Error("error sending seencheck payload to crawl HQ", "err", err.Error()) return URLs, err diff --git a/internal/pkg/source/hq/utils.go b/internal/pkg/source/hq/utils.go new file mode 100644 index 00000000..3276611e --- /dev/null +++ b/internal/pkg/source/hq/utils.go @@ -0,0 +1,12 @@ +package hq + +func pathToHop(path string) (hop int) { + // For each L in the path, add 1 hop + for _, c := range path { + if c == 'L' { + hop++ + } + } + + return hop +} diff --git a/internal/pkg/source/hq/utils_test.go b/internal/pkg/source/hq/utils_test.go new file mode 100644 index 00000000..b5533b12 --- /dev/null +++ b/internal/pkg/source/hq/utils_test.go @@ -0,0 +1,26 @@ +package hq + +import ( + "testing" +) + +func TestPathToHop(t *testing.T) { + tests := []struct { + path string + expected int + }{ + {"", 0}, + {"L", 1}, + {"LL", 2}, + {"LRL", 2}, + {"LLLL", 4}, + {"RLRLRL", 3}, + } + + for _, test := range tests { + result := pathToHop(test.path) + if result != test.expected { + t.Errorf("For path %q, expected %d hops, but got %d", test.path, test.expected, result) + } + } +} diff --git a/internal/pkg/source/hq/websocket.go b/internal/pkg/source/hq/websocket.go index eda4d5b5..dda75680 100644 --- a/internal/pkg/source/hq/websocket.go +++ b/internal/pkg/source/hq/websocket.go @@ -21,7 +21,7 @@ func Websocket() { }() for { - err := 
HQClient.Identify(&gocrawlhq.IdentifyMessage{ + err := globalHQ.client.Identify(&gocrawlhq.IdentifyMessage{ Project: config.Get().HQProject, Job: config.Get().Job, IP: utils.GetOutboundIP().String(), @@ -31,7 +31,7 @@ func Websocket() { if err != nil { slog.Error("error sending identify payload to Crawl HQ, trying to reconnect", "err", err.Error()) - err = HQClient.InitWebsocketConn() + err = globalHQ.client.InitWebsocketConn() if err != nil { slog.Error("error initializing websocket connection to crawl HQ", "err", err.Error()) } diff --git a/pkg/models/url.go b/pkg/models/url.go index 3e2eab47..3aea0878 100644 --- a/pkg/models/url.go +++ b/pkg/models/url.go @@ -14,7 +14,7 @@ type URL struct { Raw string parsed *url.URL request *http.Request - hop int // This determines the number of hops this item is the result of, a hop is a "jump" from 1 page to another page + Hop int // This determines the number of hops this item is the result of, a hop is a "jump" from 1 page to another page } func (u *URL) Parse() (err error) { @@ -34,8 +34,12 @@ func (u *URL) GetParsed() *url.URL { return u.parsed } +func (u *URL) SetHop(hop int) { + u.Hop = hop +} + func (u *URL) GetHop() int { - return u.hop + return u.Hop } // String exists to apply some custom stuff, in opposition of simply From c067658f3ba6bc3a3307276879d540dcecb87b30 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 19 Nov 2024 23:53:17 +0100 Subject: [PATCH 046/295] fix: WARC writing --- internal/pkg/archiver/archiver.go | 10 ++++++++++ internal/pkg/archiver/warc.go | 20 ++++++++++++++++++++ main.go | 14 +++++++++----- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index c8d2c447..5acf529a 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -142,6 +142,14 @@ func archive(item *models.Item) { } else { resp, err = globalArchiver.Client.Do(URL.GetRequest()) } + if err != nil { + 
logger.Error("unable to execute request", "err", err.Error(), "func", "archiver.archive") + return + } + + if resp.StatusCode != 200 { + logger.Warn("non-200 status code", "status_code", resp.StatusCode) + } // For now, we only consume it _, err = io.Copy(io.Discard, resp.Body) @@ -151,5 +159,7 @@ func archive(item *models.Item) { }() } + wg.Wait() + globalArchiver.outputCh <- item } diff --git a/internal/pkg/archiver/warc.go b/internal/pkg/archiver/warc.go index 59f9bc17..25ddb3e0 100644 --- a/internal/pkg/archiver/warc.go +++ b/internal/pkg/archiver/warc.go @@ -77,3 +77,23 @@ func startWARCWriter() { }() } } + +func GetClients() (clients []*warc.CustomHTTPClient) { + for _, c := range []*warc.CustomHTTPClient{globalArchiver.Client, globalArchiver.ClientWithProxy} { + if c != nil { + clients = append(clients, c) + } + } + + return clients +} + +func GetWARCWritingQueueSize() (total int) { + for _, c := range []*warc.CustomHTTPClient{globalArchiver.Client, globalArchiver.ClientWithProxy} { + if c != nil { + total += c.WaitGroup.Size() + } + } + + return total +} diff --git a/main.go b/main.go index ff313626..1be6afc1 100644 --- a/main.go +++ b/main.go @@ -45,20 +45,20 @@ func main() { // Start the reactor that will receive reactorOutputChan := make(chan *models.Item) // err := reactor.Start(config.Get().WorkersCount, reactorOutputChan) - err := reactor.Start(5, reactorOutputChan) + err := reactor.Start(300, reactorOutputChan) if err != nil { logger.Error("error starting reactor", "err", err.Error()) return } // Create mock seeds - seeds := 5 - mockItems := make([]*models.Item, 5) + seeds := 10000 + mockItems := make([]*models.Item, 10000) for i := 0; i < seeds; i++ { uuid := uuid.New() mockItems[i] = &models.Item{ UUID: &uuid, - URL: &models.URL{Raw: fmt.Sprintf("https://www.deezer.fr/%d", i)}, + URL: &models.URL{Raw: fmt.Sprintf("https://www.deezer.com/%d", i)}, Status: models.ItemFresh, Source: models.ItemSourceHQ, } @@ -95,7 +95,7 @@ func main() { for _, seed := 
range mockItems { err := reactor.ReceiveInsert(seed) if err != nil { - logger.Error("Error queuing seed to source channel", "error", err.Error()) + logger.Error("Error queuing seed to source channel", "err", err.Error()) return } } @@ -103,6 +103,10 @@ func main() { for { time.Sleep(1 * time.Second) if len(reactor.GetStateTable()) == 0 { + for archiver.GetWARCWritingQueueSize() != 0 { + logger.Info("waiting for WARC client(s) to finish writing to disk", "queue_size", archiver.GetWARCWritingQueueSize()) + } + finisher.Stop() postprocessor.Stop() archiver.Stop() From 0e693ef32c6e440b841de30d66387b334ef69470 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 23:27:58 +0100 Subject: [PATCH 047/295] update .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 98455fe3..f5920f64 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ Zeno zeno.log .vscode/ *.py -.DS_Store \ No newline at end of file +.DS_Store +ZENO-* \ No newline at end of file From 27c660282501a6675e7a44990944bf7c2fdd322e Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Tue, 19 Nov 2024 23:56:49 +0100 Subject: [PATCH 048/295] source/hq: interfaced with reactor --- internal/pkg/source/hq/consumer.go | 19 +++++++++++------- internal/pkg/source/hq/hq.go | 31 ++++++++++++++++++------------ 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/internal/pkg/source/hq/consumer.go b/internal/pkg/source/hq/consumer.go index 9c0a375f..aebd9da5 100644 --- a/internal/pkg/source/hq/consumer.go +++ b/internal/pkg/source/hq/consumer.go @@ -5,6 +5,7 @@ import ( "github.com/google/uuid" "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/reactor" "github.com/internetarchive/Zeno/pkg/models" "github.com/internetarchive/gocrawlhq" ) @@ -14,7 +15,7 @@ func consumer() { for { select { - case <-globalHQ.consumerStopChan: + case <-globalHQ.ctx.Done(): // Received signal to 
stop // Wait for all batch-sending goroutines to finish wg.Wait() @@ -44,13 +45,13 @@ func consumer() { wg.Add(1) // Send the URLs to the reactor in a goroutine - go func(urls []gocrawlhq.URL) { + go func(URLs []gocrawlhq.URL) { defer wg.Done() // Decrement the WaitGroup counter when done - totalURLs := len(urls) - for i, URL := range urls { + totalURLs := len(URLs) + for i, URL := range URLs { UUID := uuid.New() - globalHQ.outputChan <- &models.Item{ + newItem := &models.Item{ UUID: &UUID, URL: &models.URL{ Raw: URL.Value, @@ -60,6 +61,10 @@ func consumer() { Source: models.ItemSourceHQ, } + if err := reactor.ReceiveInsert(newItem); err != nil { + panic("couldn't insert seed in reactor") + } + // When one-third of the URLs are left, send a pre-fetch signal if i == totalURLs-totalURLs/3 { // Send pre-fetch signal to Consumer @@ -72,7 +77,7 @@ func consumer() { // Check if stop signal is received to exit early select { - case <-globalHQ.consumerStopChan: + case <-globalHQ.ctx.Done(): // Stop signal received, exit the goroutine return default: @@ -86,7 +91,7 @@ func consumer() { case <-prefetchSignal: // Received pre-fetch signal; continue to fetch next batch continue - case <-globalHQ.consumerStopChan: + case <-globalHQ.ctx.Done(): // Received signal to stop // Wait for all batch-sending goroutines to finish wg.Wait() diff --git a/internal/pkg/source/hq/hq.go b/internal/pkg/source/hq/hq.go index 545116cd..d5ac9f79 100644 --- a/internal/pkg/source/hq/hq.go +++ b/internal/pkg/source/hq/hq.go @@ -13,13 +13,11 @@ import ( ) type hq struct { - wg sync.WaitGroup - ctx context.Context - cancel context.CancelFunc - inputChan chan *models.Item - outputChan chan *models.Item - client *gocrawlhq.Client - consumerStopChan chan struct{} + wg sync.WaitGroup + ctx context.Context + cancel context.CancelFunc + inputCh chan *models.Item + client *gocrawlhq.Client } var ( @@ -28,7 +26,7 @@ var ( logger *log.FieldedLogger ) -func Start(outputChan chan *models.Item) error { +func 
Start(inputChan chan *models.Item) error { var done bool log.Start() @@ -41,15 +39,25 @@ func Start(outputChan chan *models.Item) error { once.Do(func() { var err error - globalHQ.client, err = gocrawlhq.Init(config.Get().HQKey, config.Get().HQSecret, config.Get().HQProject, config.Get().HQAddress, "") + ctx, cancel := context.WithCancel(context.Background()) + HQclient, err := gocrawlhq.Init(config.Get().HQKey, config.Get().HQSecret, config.Get().HQProject, config.Get().HQAddress, "") if err != nil { logger.Error("error initializing crawl HQ client", "err", err.Error(), "func", "hq.Start") os.Exit(1) } - globalHQ.wg.Add(2) + globalHQ = &hq{ + wg: sync.WaitGroup{}, + ctx: ctx, + cancel: cancel, + inputCh: inputChan, + client: HQclient, + } + + globalHQ.wg.Add(1) go consumer() - // go Finisher() + // go producer() + // go finisher() done = true }) @@ -64,7 +72,6 @@ func Stop() { if globalHQ != nil { globalHQ.cancel() globalHQ.wg.Wait() - close(globalHQ.outputChan) logger.Info("stopped") } } From cc4b0353c4d58d6c94d15587733bb6fc25b64868 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Wed, 20 Nov 2024 11:01:05 +0100 Subject: [PATCH 049/295] add: redirects & response to URL struct --- .old/internal/pkg/crawl/capture.go | 36 ++++++++++++++-------------- .old/internal/pkg/crawl/utils.go | 9 ------- internal/pkg/postprocessor/utils.go | 10 ++++++++ internal/pkg/source/hq/consumer.go | 4 ++-- internal/pkg/source/hq/utils.go | 6 ++--- internal/pkg/source/hq/utils_test.go | 2 +- pkg/models/url.go | 32 +++++++++++++++++++------ 7 files changed, 59 insertions(+), 40 deletions(-) create mode 100644 internal/pkg/postprocessor/utils.go diff --git a/.old/internal/pkg/crawl/capture.go b/.old/internal/pkg/crawl/capture.go index 0e7308c8..1d0dcaee 100644 --- a/.old/internal/pkg/crawl/capture.go +++ b/.old/internal/pkg/crawl/capture.go @@ -35,24 +35,24 @@ func (c *Crawl) executeGET(item *queue.Item, req *http.Request, isRedirection bo URL *url.URL ) - defer func() { - if 
c.PrometheusMetrics != nil { - c.PrometheusMetrics.DownloadedURI.Inc() - } - - c.URIsPerSecond.Incr(1) - - if item.Type == "seed" { - c.CrawledSeeds.Incr(1) - } else if item.Type == "asset" { - c.CrawledAssets.Incr(1) - } - }() - - // Check if the crawl is paused - for c.Paused.Get() { - time.Sleep(time.Second) - } + // defer func() { + // if c.PrometheusMetrics != nil { + // c.PrometheusMetrics.DownloadedURI.Inc() + // } + + // c.URIsPerSecond.Incr(1) + + // if item.Type == "seed" { + // c.CrawledSeeds.Incr(1) + // } else if item.Type == "asset" { + // c.CrawledAssets.Incr(1) + // } + // }() + + // // Check if the crawl is paused + // for c.Paused.Get() { + // time.Sleep(time.Second) + // } // Retry on 429 error for retry := uint8(0); retry < c.MaxRetry; retry++ { diff --git a/.old/internal/pkg/crawl/utils.go b/.old/internal/pkg/crawl/utils.go index d59434cd..8a7d718b 100644 --- a/.old/internal/pkg/crawl/utils.go +++ b/.old/internal/pkg/crawl/utils.go @@ -82,12 +82,3 @@ func extractLinksFromText(source string) (links []*url.URL) { // func (c *Crawl) shouldPause(host string) bool { // return c.Frontier.GetActiveHostCount(host) >= c.MaxConcurrentRequestsPerDomain // } - -func isStatusCodeRedirect(statusCode int) bool { - if statusCode == 300 || statusCode == 301 || - statusCode == 302 || statusCode == 307 || - statusCode == 308 { - return true - } - return false -} diff --git a/internal/pkg/postprocessor/utils.go b/internal/pkg/postprocessor/utils.go new file mode 100644 index 00000000..a55f3dbf --- /dev/null +++ b/internal/pkg/postprocessor/utils.go @@ -0,0 +1,10 @@ +package postprocessor + +func isStatusCodeRedirect(statusCode int) bool { + if statusCode == 300 || statusCode == 301 || + statusCode == 302 || statusCode == 307 || + statusCode == 308 { + return true + } + return false +} diff --git a/internal/pkg/source/hq/consumer.go b/internal/pkg/source/hq/consumer.go index aebd9da5..c59efc90 100644 --- a/internal/pkg/source/hq/consumer.go +++ 
b/internal/pkg/source/hq/consumer.go @@ -54,8 +54,8 @@ func consumer() { newItem := &models.Item{ UUID: &UUID, URL: &models.URL{ - Raw: URL.Value, - Hop: pathToHop(URL.Path), + Raw: URL.Value, + Hops: pathToHops(URL.Path), }, Status: models.ItemFresh, Source: models.ItemSourceHQ, diff --git a/internal/pkg/source/hq/utils.go b/internal/pkg/source/hq/utils.go index 3276611e..7a1d7a1b 100644 --- a/internal/pkg/source/hq/utils.go +++ b/internal/pkg/source/hq/utils.go @@ -1,12 +1,12 @@ package hq -func pathToHop(path string) (hop int) { +func pathToHops(path string) (hops int) { // For each L in the path, add 1 hop for _, c := range path { if c == 'L' { - hop++ + hops++ } } - return hop + return hops } diff --git a/internal/pkg/source/hq/utils_test.go b/internal/pkg/source/hq/utils_test.go index b5533b12..8c24f6c9 100644 --- a/internal/pkg/source/hq/utils_test.go +++ b/internal/pkg/source/hq/utils_test.go @@ -18,7 +18,7 @@ func TestPathToHop(t *testing.T) { } for _, test := range tests { - result := pathToHop(test.path) + result := pathToHops(test.path) if result != test.expected { t.Errorf("For path %q, expected %d hops, but got %d", test.path, test.expected, result) } diff --git a/pkg/models/url.go b/pkg/models/url.go index 3aea0878..3f62df0f 100644 --- a/pkg/models/url.go +++ b/pkg/models/url.go @@ -11,10 +11,12 @@ import ( ) type URL struct { - Raw string - parsed *url.URL - request *http.Request - Hop int // This determines the number of hops this item is the result of, a hop is a "jump" from 1 page to another page + Raw string + parsed *url.URL + request *http.Request + response *http.Response + Hops int // This determines the number of hops this item is the result of, a hop is a "jump" from 1 page to another page + redirects int } func (u *URL) Parse() (err error) { @@ -34,12 +36,28 @@ func (u *URL) GetParsed() *url.URL { return u.parsed } -func (u *URL) SetHop(hop int) { - u.Hop = hop +func (u *URL) SetResponse(r *http.Response) { + u.response = r +} + +func (u 
*URL) GetResponse() *http.Response { + return u.response +} + +func (u *URL) GetRedirects() int { + return u.redirects +} + +func (u *URL) IncRedirects() { + u.redirects++ +} + +func (u *URL) SetHops(hops int) { + u.Hops = hops } func (u *URL) GetHop() int { - return u.Hop + return u.Hops } // String exists to apply some custom stuff, in opposition of simply From c4cf848e2366f6827dbb041c6b7b02532e5d84db Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Wed, 20 Nov 2024 11:03:35 +0100 Subject: [PATCH 050/295] add: ItemCanceled state --- pkg/models/seed.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/models/seed.go b/pkg/models/seed.go index d5cdf3d9..a2ce525a 100644 --- a/pkg/models/seed.go +++ b/pkg/models/seed.go @@ -27,6 +27,8 @@ const ( ItemCaptured // ItemPostProcessed is the state after the item has been post-processed ItemPostProcessed + // ItemCanceled is the state after the item has been canceled, e.g. cancellation can come from exceeding the redirects limit + ItemCanceled // ItemFailed is the state after the item has failed ItemFailed // ItemCompleted is the state after the item has been completed From e5d65f998548210746252ead6905c276dcf0c590 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Wed, 20 Nov 2024 11:55:25 +0100 Subject: [PATCH 051/295] add: redirection handling --- internal/pkg/archiver/archiver.go | 9 +-- internal/pkg/postprocessor/postprocessor.go | 19 +++++- pkg/models/{seed.go => item.go} | 75 +++++++++++++++++++++ pkg/models/url.go | 8 +-- 4 files changed, 102 insertions(+), 9 deletions(-) rename pkg/models/{seed.go => item.go} (59%) diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index 5acf529a..5236b57c 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -102,6 +102,7 @@ func run() { defer func() { <-guard }() defer stats.ArchiverRoutinesDecr() archive(item) + globalArchiver.outputCh <- item }() } } @@ -120,9 +121,11 @@ func archive(item 
*models.Item) { // Determines the URLs that need to be captured, if the item's status is fresh we need // to capture the seed, else we need to capture the child URLs (assets), in parallel if item.Status == models.ItemFresh { - URLsToCapture = append(URLsToCapture, item.URL) + URLsToCapture = append(URLsToCapture, item.GetURL()) + } else if item.GetRedirection() != nil { + URLsToCapture = append(URLsToCapture, item.GetRedirection()) } else { - URLsToCapture = item.Childs + URLsToCapture = item.GetChilds() } for _, URL := range URLsToCapture { @@ -160,6 +163,4 @@ func archive(item *models.Item) { } wg.Wait() - - globalArchiver.outputCh <- item } diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index a7666e09..95be48ee 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -101,5 +101,22 @@ func run() { } func postprocess(item *models.Item) { - // TODO + // Verify if there is any redirection + if isStatusCodeRedirect(item.URL.GetResponse().StatusCode) { + // Check if the current redirections count doesn't exceed the max allowed + if item.URL.GetRedirects() >= config.Get().MaxRedirect { + logger.Warn("max redirects reached", "item", item.UUID.String()) + item.Status = models.ItemCanceled + return + } + + // Prepare the new item resulting from the redirection + item.Redirection = &models.URL{ + Raw: item.URL.GetResponse().Header.Get("Location"), + Redirects: item.URL.GetRedirects() + 1, + Hops: item.URL.GetHops(), + } + + return + } } diff --git a/pkg/models/seed.go b/pkg/models/item.go similarity index 59% rename from pkg/models/seed.go rename to pkg/models/item.go index a2ce525a..f906d87e 100644 --- a/pkg/models/seed.go +++ b/pkg/models/item.go @@ -11,10 +11,83 @@ type Item struct { Status ItemState // Status is the state of the item in the pipeline Source ItemSource // Source is the source of the item in the pipeline ChildsCaptured bool // ChildsCaptured is 
the flag to indicate if the child URLs of the item have been captured + Redirection *URL // Redirection is the URL that the item has been redirected to, if it's not nil it need to be captured Childs []*URL // Childs is the list of URLs that have been discovered via the item's URL Error error // Error message of the seed } +func NewItem(source ItemSource) (item *Item) { + UUID := uuid.New() + + item = &Item{ + UUID: &UUID, + Status: ItemFresh, + Source: source, + } + + return item +} + +func (i *Item) AddChild(child *URL) { + i.Childs = append(i.Childs, child) +} + +func (i *Item) GetChilds() []*URL { + return i.Childs +} + +func (i *Item) GetUUID() *uuid.UUID { + return i.UUID +} + +func (i *Item) GetURL() *URL { + return i.URL +} + +func (i *Item) GetStatus() ItemState { + return i.Status +} + +func (i *Item) GetSource() ItemSource { + return i.Source +} + +func (i *Item) GetChildsCaptured() bool { + return i.ChildsCaptured +} + +func (i *Item) GetRedirection() *URL { + return i.Redirection +} + +func (i *Item) GetError() error { + return i.Error +} + +func (i *Item) SetURL(url *URL) { + i.URL = url +} + +func (i *Item) SetStatus(status ItemState) { + i.Status = status +} + +func (i *Item) SetSource(source ItemSource) { + i.Source = source +} + +func (i *Item) SetChildsCaptured(captured bool) { + i.ChildsCaptured = captured +} + +func (i *Item) SetRedirection(redirection *URL) { + i.Redirection = redirection +} + +func (i *Item) SetError(err error) { + i.Error = err +} + // ItemState qualifies the state of a item in the pipeline type ItemState int @@ -45,6 +118,8 @@ const ( ItemSourceQueue // ItemSourceHQ is for items that are from the HQ ItemSourceHQ + // ItemSourcePostprocess is for items generated from redirections + ItemSourcePostprocess // ItemSourceFeedback is for items that are from the Feedback ItemSourceFeedback ) diff --git a/pkg/models/url.go b/pkg/models/url.go index 3f62df0f..f21e7e4e 100644 --- a/pkg/models/url.go +++ b/pkg/models/url.go @@ -16,7 
+16,7 @@ type URL struct { request *http.Request response *http.Response Hops int // This determines the number of hops this item is the result of, a hop is a "jump" from 1 page to another page - redirects int + Redirects int } func (u *URL) Parse() (err error) { @@ -45,18 +45,18 @@ func (u *URL) GetResponse() *http.Response { } func (u *URL) GetRedirects() int { - return u.redirects + return u.Redirects } func (u *URL) IncRedirects() { - u.redirects++ + u.Redirects++ } func (u *URL) SetHops(hops int) { u.Hops = hops } -func (u *URL) GetHop() int { +func (u *URL) GetHops() int { return u.Hops } From 50d9b628ddc90452da47dd380ceb7bee8885d8bf Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Wed, 20 Nov 2024 12:18:55 +0100 Subject: [PATCH 052/295] change consumer design with fetcher/sender pattern --- internal/pkg/source/hq/consumer.go | 213 +++++++++++++++-------------- internal/pkg/source/hq/hq.go | 121 ++-------------- 2 files changed, 126 insertions(+), 208 deletions(-) diff --git a/internal/pkg/source/hq/consumer.go b/internal/pkg/source/hq/consumer.go index c59efc90..76d54285 100644 --- a/internal/pkg/source/hq/consumer.go +++ b/internal/pkg/source/hq/consumer.go @@ -1,7 +1,9 @@ package hq import ( + "context" "sync" + "time" "github.com/google/uuid" "github.com/internetarchive/Zeno/internal/pkg/config" @@ -11,134 +13,145 @@ import ( ) func consumer() { - var wg sync.WaitGroup // WaitGroup to track batch-sending goroutines + // Create a context to manage goroutines + ctx, cancel := context.WithCancel(globalHQ.ctx) + defer cancel() + // Set the batch size for fetching URLs + batchSize := config.Get().HQBatchSize + + // Create a fixed-size buffer (channel) for URLs + urlBuffer := make(chan *gocrawlhq.URL, batchSize) + + // WaitGroup to wait for goroutines to finish on shutdown + var wg sync.WaitGroup + + // Start the fetcher goroutine(s) + wg.Add(1) + go fetcher(ctx, &wg, urlBuffer, batchSize) + + // Start the sender goroutine(s) + wg.Add(1) + go sender(ctx, 
&wg, urlBuffer) + + // Wait for shutdown signal + <-globalHQ.ctx.Done() + + // Cancel the context to stop all goroutines + cancel() + + // Wait for all goroutines to finish + wg.Wait() + + // Close the urlBuffer to signal senders to finish + close(urlBuffer) +} + +func fetcher(ctx context.Context, wg *sync.WaitGroup, urlBuffer chan<- *gocrawlhq.URL, batchSize int) { + defer wg.Done() for { + // Check for context cancellation select { - case <-globalHQ.ctx.Done(): - // Received signal to stop - // Wait for all batch-sending goroutines to finish - wg.Wait() + case <-ctx.Done(): return default: - // This is purposely evaluated every time, - // because the value of workers might change - // during the crawl in the future (to be implemented) - var HQBatchSize = config.Get().WorkersCount - - // If a specific HQ batch size is set, use it - if config.Get().HQBatchSize != 0 { - HQBatchSize = config.Get().HQBatchSize - } + } - // Get a batch of URLs from crawl HQ - URLs, err := getURLs(HQBatchSize) - if err != nil { - logger.Error("error getting new URLs from crawl HQ", "err", err.Error(), "func", "hq.Consumer") - continue - } + // Fetch URLs from HQ + URLs, err := getURLs(batchSize) + if err != nil { + logger.Error("error fetching URLs from CrawlHQ", "err", err.Error(), "func", "hq.fetcher") + time.Sleep(250 * time.Millisecond) + continue + } - // Channel to receive pre-fetch signal - prefetchSignal := make(chan struct{}, 1) - - // Increment the WaitGroup counter - wg.Add(1) - - // Send the URLs to the reactor in a goroutine - go func(URLs []gocrawlhq.URL) { - defer wg.Done() // Decrement the WaitGroup counter when done - - totalURLs := len(URLs) - for i, URL := range URLs { - UUID := uuid.New() - newItem := &models.Item{ - UUID: &UUID, - URL: &models.URL{ - Raw: URL.Value, - Hops: pathToHops(URL.Path), - }, - Status: models.ItemFresh, - Source: models.ItemSourceHQ, - } - - if err := reactor.ReceiveInsert(newItem); err != nil { - panic("couldn't insert seed in reactor") - } 
- - // When one-third of the URLs are left, send a pre-fetch signal - if i == totalURLs-totalURLs/3 { - // Send pre-fetch signal to Consumer - select { - case prefetchSignal <- struct{}{}: - default: - // Signal already sent; do nothing - } - } - - // Check if stop signal is received to exit early - select { - case <-globalHQ.ctx.Done(): - // Stop signal received, exit the goroutine - return - default: - // Continue sending URLs - } - } - }(URLs) - - // Wait for pre-fetch signal or stop signal + // Enqueue URLs into the buffer + for _, URL := range URLs { select { - case <-prefetchSignal: - // Received pre-fetch signal; continue to fetch next batch - continue - case <-globalHQ.ctx.Done(): - // Received signal to stop - // Wait for all batch-sending goroutines to finish - wg.Wait() + case <-ctx.Done(): return + case urlBuffer <- &URL: + } } } } -func getURLs(HQBatchSize int) ([]gocrawlhq.URL, error) { +func sender(ctx context.Context, wg *sync.WaitGroup, urlBuffer <-chan *gocrawlhq.URL) { + defer wg.Done() + for { + select { + case <-ctx.Done(): + return + case URL, ok := <-urlBuffer: + if !ok { + // Channel closed, exit the sender + return + } + + // Process the URL and send to reactor + err := processAndSend(URL) + if err != nil { + panic(err) + } + } + } +} + +func processAndSend(URL *gocrawlhq.URL) error { + UUID := uuid.New() + newItem := &models.Item{ + UUID: &UUID, + URL: &models.URL{ + Raw: URL.Value, + Hops: 0, + }, + Status: models.ItemFresh, + Source: models.ItemSourceHQ, + } + + // Send the item to the reactor + err := reactor.ReceiveInsert(newItem) + if err != nil { + return err + } + return nil +} + +func getURLs(batchSize int) ([]gocrawlhq.URL, error) { + // Fetch URLs from CrawlHQ with optional concurrency if config.Get().HQBatchConcurrency == 1 { - return globalHQ.client.Get(HQBatchSize, config.Get().HQStrategy) + return globalHQ.client.Get(batchSize, config.Get().HQStrategy) } - var mu sync.Mutex var wg sync.WaitGroup - batchSize := HQBatchSize / 
config.Get().HQBatchConcurrency - URLsChan := make(chan []gocrawlhq.URL, config.Get().HQBatchConcurrency) - var URLs []gocrawlhq.URL + concurrency := config.Get().HQBatchConcurrency + subBatchSize := batchSize / concurrency + urlsChan := make(chan []gocrawlhq.URL, concurrency) + var allURLs []gocrawlhq.URL - // Start goroutines to get URLs from crawl HQ, each will request - // HQBatchSize / HQConcurrentBatch URLs - for i := 0; i < config.Get().HQBatchConcurrency; i++ { + // Start concurrent fetches + for i := 0; i < concurrency; i++ { wg.Add(1) go func() { defer wg.Done() - URLs, err := globalHQ.client.Get(batchSize, config.Get().HQStrategy) + URLs, err := globalHQ.client.Get(subBatchSize, config.Get().HQStrategy) if err != nil { - logger.Error("error getting new URLs from crawl HQ", "err", err.Error(), "func", "hq.getURLs") + logger.Error("error fetching URLs from CrawlHQ", "err", err.Error(), "func", "hq.getURLs") return } - URLsChan <- URLs + urlsChan <- URLs }() } - // Wait for all goroutines to finish - go func() { - wg.Wait() - close(URLsChan) - }() - - // Collect all URLs from the channels - for URLsFromChan := range URLsChan { - mu.Lock() - URLs = append(URLs, URLsFromChan...) - mu.Unlock() + // Wait for all fetches to complete + wg.Wait() + close(urlsChan) + + // Collect URLs from all fetches + for URLs := range urlsChan { + allURLs = append(allURLs, URLs...) 
} - return URLs, nil + return allURLs, nil } diff --git a/internal/pkg/source/hq/hq.go b/internal/pkg/source/hq/hq.go index d5ac9f79..9f28dcdb 100644 --- a/internal/pkg/source/hq/hq.go +++ b/internal/pkg/source/hq/hq.go @@ -13,11 +13,12 @@ import ( ) type hq struct { - wg sync.WaitGroup - ctx context.Context - cancel context.CancelFunc - inputCh chan *models.Item - client *gocrawlhq.Client + wg sync.WaitGroup + ctx context.Context + cancel context.CancelFunc + finishCh chan *models.Item + producedCh chan *models.Item + client *gocrawlhq.Client } var ( @@ -26,7 +27,7 @@ var ( logger *log.FieldedLogger ) -func Start(inputChan chan *models.Item) error { +func Start(finishChan, producedChan chan *models.Item) error { var done bool log.Start() @@ -47,11 +48,12 @@ func Start(inputChan chan *models.Item) error { } globalHQ = &hq{ - wg: sync.WaitGroup{}, - ctx: ctx, - cancel: cancel, - inputCh: inputChan, - client: HQclient, + wg: sync.WaitGroup{}, + ctx: ctx, + cancel: cancel, + finishCh: finishChan, + producedCh: producedChan, + client: HQclient, } globalHQ.wg.Add(1) @@ -76,103 +78,6 @@ func Stop() { } } -// func HQProducer() { -// defer c.HQChannelsWg.Done() - -// var ( -// discoveredArray = []gocrawlhq.URL{} -// mutex = sync.Mutex{} -// terminateProducer = make(chan bool) -// ) - -// // the discoveredArray is sent to the crawl HQ every 10 seconds -// // or when it reaches a certain size -// go func() { -// HQLastSent := time.Now() - -// for { -// select { -// case <-terminateProducer: -// // no need to lock the mutex here, because the producer channel -// // is already closed, so no other goroutine can write to the slice -// if len(discoveredArray) > 0 { -// for { -// err := c.HQClient.Add(discoveredArray, false) -// if err != nil { -// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..") -// time.Sleep(time.Second) -// continue -// } -// break -// } -// } - -// return -// default: 
-// mutex.Lock() -// if (len(discoveredArray) >= int(math.Ceil(float64(c.Workers.Count)/2)) || time.Since(HQLastSent) >= time.Second*10) && len(discoveredArray) > 0 { -// for { -// err := c.HQClient.Add(discoveredArray, false) -// if err != nil { -// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..") -// time.Sleep(time.Second) -// continue -// } -// break -// } - -// discoveredArray = []gocrawlhq.URL{} -// HQLastSent = time.Now() -// } -// mutex.Unlock() -// } -// } -// }() - -// // listen to the discovered channel and add the URLs to the discoveredArray -// for discoveredItem := range c.HQProducerChannel { -// var via string - -// if discoveredItem.ParentURL != nil { -// via = utils.URLToString(discoveredItem.ParentURL) -// } - -// discoveredURL := gocrawlhq.URL{ -// Value: utils.URLToString(discoveredItem.URL), -// Via: via, -// } - -// for i := uint64(0); i < discoveredItem.Hop; i++ { -// discoveredURL.Path += "L" -// } - -// // The reason we are using a string instead of a bool is because -// // gob's encode/decode doesn't properly support booleans -// if discoveredItem.BypassSeencheck { -// for { -// err := c.HQClient.Add([]gocrawlhq.URL{discoveredURL}, true) -// if err != nil { -// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ -// "bypassSeencheck": discoveredItem.BypassSeencheck, -// })).Error("error sending payload to crawl HQ, waiting 1s then retrying..") -// time.Sleep(time.Second) -// continue -// } -// break -// } -// continue -// } - -// mutex.Lock() -// discoveredArray = append(discoveredArray, discoveredURL) -// mutex.Unlock() -// } - -// // if we are here, it means that the HQProducerChannel has been closed -// // so we need to send the last payload to the crawl HQ -// terminateProducer <- true -// } - // func HQFinisher() { // defer c.HQChannelsWg.Done() From dd2689b83dceac2f43cc73c3eb0fecadaac41421 Mon Sep 17 00:00:00 2001 From: Thomas 
FOUBERT Date: Wed, 20 Nov 2024 13:00:18 +0100 Subject: [PATCH 053/295] replace *uuid.UUID to string in models/item --- internal/pkg/archiver/archiver.go | 2 +- internal/pkg/finisher/finisher.go | 10 +- internal/pkg/postprocessor/postprocessor.go | 4 +- internal/pkg/preprocessor/preprocessor.go | 2 +- internal/pkg/reactor/reactor.go | 8 +- internal/pkg/reactor/reactor_test.go | 6 +- internal/pkg/source/hq/consumer.go | 7 +- internal/pkg/source/hq/finisher.go | 169 ++++++++++++++++++++ internal/pkg/source/hq/producer.go | 106 ++++++++++++ internal/pkg/source/hq/utils.go | 9 ++ main.go | 4 +- pkg/models/item.go | 11 +- 12 files changed, 311 insertions(+), 27 deletions(-) create mode 100644 internal/pkg/source/hq/finisher.go create mode 100644 internal/pkg/source/hq/producer.go diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index 5236b57c..1b1c30fe 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -93,7 +93,7 @@ func run() { return case item, ok := <-globalArchiver.inputCh: if ok { - logger.Info("received item", "item", item.UUID.String()) + logger.Info("received item", "item", item.ID) guard <- struct{}{} wg.Add(1) stats.ArchiverRoutinesIncr() diff --git a/internal/pkg/finisher/finisher.go b/internal/pkg/finisher/finisher.go index 4376a988..a05bdff3 100644 --- a/internal/pkg/finisher/finisher.go +++ b/internal/pkg/finisher/finisher.go @@ -80,26 +80,26 @@ func (f *finisher) run() { panic("received nil item") } - logger.Debug("received item", "item", item.UUID.String()) + logger.Debug("received item", "item", item.ID) if item.Error != nil { - logger.Error("received item with error", "item", item.UUID.String(), "error", item.Error) + logger.Error("received item with error", "item", item.ID, "error", item.Error) f.errorCh <- item continue } reactor.MarkAsFinished(item) - logger.Info("item finished", "item", item.UUID.String()) + logger.Info("item finished", "item", item.ID) case item := 
<-f.errorCh: if item == nil { panic("received nil item") } - logger.Info("received item with error", "item", item.UUID.String(), "error", item.Error) + logger.Info("received item with error", "item", item.ID, "error", item.Error) reactor.MarkAsFinished(item) - logger.Info("item with error finished", "item", item.UUID.String()) + logger.Info("item with error finished", "item", item.ID) } } } diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index 95be48ee..bda9764f 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -84,7 +84,7 @@ func run() { return case item, ok := <-globalPostprocessor.inputCh: if ok { - logger.Info("received item", "item", item.UUID.String()) + logger.Info("received item", "item", item.ID) guard <- struct{}{} wg.Add(1) stats.PostprocessorRoutinesIncr() @@ -105,7 +105,7 @@ func postprocess(item *models.Item) { if isStatusCodeRedirect(item.URL.GetResponse().StatusCode) { // Check if the current redirections count doesn't exceed the max allowed if item.URL.GetRedirects() >= config.Get().MaxRedirect { - logger.Warn("max redirects reached", "item", item.UUID.String()) + logger.Warn("max redirects reached", "item", item.ID) item.Status = models.ItemCanceled return } diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index c63a69d8..ca7edd31 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -87,7 +87,7 @@ func run() { return case item, ok := <-globalPreprocessor.inputCh: if ok { - logger.Info("received item", "item", item.UUID.String()) + logger.Info("received item", "item", item.ID) guard <- struct{}{} wg.Add(1) stats.PreprocessorRoutinesIncr() diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index 204ed3a4..767ccb57 100644 --- a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -82,7 
+82,7 @@ func ReceiveFeedback(item *models.Item) error { } item.Source = models.ItemSourceFeedback - _, loaded := globalReactor.stateTable.Swap(item.UUID.String(), item) + _, loaded := globalReactor.stateTable.Swap(item.ID, item) if !loaded { // An item sent to the feedback channel should be present on the state table, if not present reactor should error out return ErrFeedbackItemNotPresent @@ -98,7 +98,7 @@ func ReceiveFeedback(item *models.Item) error { // ReceiveInsert sends an item to the input channel consuming a token. // It is the responsibility of the sender to set either ItemSourceQueue or ItemSourceHQ, if not set seed will get forced ItemSourceInsert func ReceiveInsert(item *models.Item) error { - logger.Info("received seed", "seed", item.UUID.String()) + logger.Info("received seed", "seed", item.ID) if globalReactor == nil { return ErrReactorNotInitialized } @@ -108,7 +108,7 @@ func ReceiveInsert(item *models.Item) error { if item.Source != models.ItemSourceQueue && item.Source != models.ItemSourceHQ { item.Source = models.ItemSourceInsert } - globalReactor.stateTable.Store(item.UUID.String(), item) + globalReactor.stateTable.Store(item.ID, item) globalReactor.input <- item return nil case <-globalReactor.ctx.Done(): @@ -122,7 +122,7 @@ func MarkAsFinished(item *models.Item) error { return ErrReactorNotInitialized } - if _, loaded := globalReactor.stateTable.LoadAndDelete(item.UUID.String()); loaded { + if _, loaded := globalReactor.stateTable.LoadAndDelete(item.ID); loaded { <-globalReactor.tokenPool return nil } diff --git a/internal/pkg/reactor/reactor_test.go b/internal/pkg/reactor/reactor_test.go index f995de83..9b3c4944 100644 --- a/internal/pkg/reactor/reactor_test.go +++ b/internal/pkg/reactor/reactor_test.go @@ -61,7 +61,7 @@ func _testerFunc(tokens, consumers, seeds int, t testing.TB) { if item.Source != models.ItemSourceFeedback { err := ReceiveFeedback(item) if err != nil { - fatalChan <- fmt.Errorf("Error sending feedback: %s - %s", err, 
item.UUID.String()) + fatalChan <- fmt.Errorf("Error sending feedback: %s - %s", err, item.ID) } continue } @@ -82,9 +82,9 @@ func _testerFunc(tokens, consumers, seeds int, t testing.TB) { // Create mock seeds mockItems := []*models.Item{} for i := 0; i <= seeds; i++ { - uuid := uuid.New() + uuid := uuid.New().String() mockItems = append(mockItems, &models.Item{ - UUID: &uuid, + ID: uuid, URL: &models.URL{Raw: fmt.Sprintf("http://example.com/%d", i)}, Status: models.ItemFresh, Source: models.ItemSourceHQ, diff --git a/internal/pkg/source/hq/consumer.go b/internal/pkg/source/hq/consumer.go index 76d54285..74314ba2 100644 --- a/internal/pkg/source/hq/consumer.go +++ b/internal/pkg/source/hq/consumer.go @@ -5,7 +5,6 @@ import ( "sync" "time" - "github.com/google/uuid" "github.com/internetarchive/Zeno/internal/pkg/config" "github.com/internetarchive/Zeno/internal/pkg/reactor" "github.com/internetarchive/Zeno/pkg/models" @@ -99,13 +98,13 @@ func sender(ctx context.Context, wg *sync.WaitGroup, urlBuffer <-chan *gocrawlhq } func processAndSend(URL *gocrawlhq.URL) error { - UUID := uuid.New() newItem := &models.Item{ - UUID: &UUID, + ID: URL.ID, URL: &models.URL{ Raw: URL.Value, - Hops: 0, + Hops: pathToHops(URL.Path), }, + Via: URL.Via, Status: models.ItemFresh, Source: models.ItemSourceHQ, } diff --git a/internal/pkg/source/hq/finisher.go b/internal/pkg/source/hq/finisher.go new file mode 100644 index 00000000..80501f9a --- /dev/null +++ b/internal/pkg/source/hq/finisher.go @@ -0,0 +1,169 @@ +package hq + +// import ( +// "context" +// "sync" +// "time" + +// "github.com/internetarchive/Zeno/internal/pkg/config" +// "github.com/internetarchive/Zeno/internal/pkg/log" +// "github.com/internetarchive/gocrawlhq" +// ) + +// var ( +// // batchCh is a buffered channel that holds batches ready to be sent to HQ. +// // Its capacity is set to the maximum number of sender routines. 
+// batchCh chan []*gocrawlhq.URL +// ) + +// // finisher initializes and starts the finisher and dispatcher processes. +// func finisher() { +// var wg sync.WaitGroup + +// maxSenders := getMaxFinishSenders() +// batchCh = make(chan []*gocrawlhq.URL, maxSenders) + +// wg.Add(1) +// go receiver(ctx, &wg) + +// wg.Add(1) +// go dispatcher(ctx, &wg) + +// // Wait for the context to be canceled. +// <-ctx.Done() + +// // Wait for the finisher and dispatcher to finish. +// wg.Wait() +// } + +// // finishReceiver reads URLs from finishCh, accumulates them into batches, and sends the batches to batchCh. +// func finishReceiver(ctx context.Context, wg *sync.WaitGroup) { +// defer wg.Done() + +// logger := log.NewFieldedLogger(&log.Fields{ +// "component": "hq/finishReceiver", +// }) + +// batchSize := getBatchSize() +// maxWaitTime := 5 * time.Second + +// batch := make([]*gocrawlhq.URL, 0, batchSize) +// timer := time.NewTimer(maxWaitTime) +// defer timer.Stop() + +// for { +// select { +// case <-ctx.Done(): +// // Send any remaining URLs. +// if len(batch) > 0 { +// batchCh <- batch // Blocks if batchCh is full. +// } +// return +// case url := <-globalHQ.finishCh: +// URLToSend := &gocrawlhq.URL{ +// ID: url.ID, +// } +// batch = append(batch, &URLToSend) +// if len(batch) >= batchSize { +// // Send the batch to batchCh. +// batchCh <- batch // Blocks if batchCh is full. +// batch = make([]gocrawlhq.URL, 0, batchSize) +// resetTimer(timer, maxWaitTime) +// } +// case <-timer.C: +// if len(batch) > 0 { +// batchCh <- batch // Blocks if batchCh is full. +// batch = make([]gocrawlhq.URL, 0, batchSize) +// } +// resetTimer(timer, maxWaitTime) +// } +// } +// } + +// // finishDispatcher receives batches from batchCh and dispatches them to sender routines. 
+// func finishDispatcher(ctx context.Context, wg *sync.WaitGroup) { +// defer wg.Done() + +// logger := log.NewFieldedLogger(&log.Fields{ +// "component": "hq/dispatcher", +// }) + +// maxSenders := getMaxFinishSenders() +// senderSemaphore := make(chan struct{}, maxSenders) +// var senderWg sync.WaitGroup + +// for { +// select { +// case batch := <-batchCh: +// senderSemaphore <- struct{}{} // Blocks if maxSenders reached. +// senderWg.Add(1) +// go func(batch []gocrawlhq.URL) { +// defer senderWg.Done() +// defer func() { <-senderSemaphore }() +// finishSender(ctx, batch) +// }(batch) +// case <-ctx.Done(): +// // Wait for all sender routines to finish. +// senderWg.Wait() +// return +// } +// } +// } + +// // finishSender sends a batch of URLs to HQ with retries and exponential backoff. +// func finishSender(ctx context.Context, batch []gocrawlhq.URL) { +// logger := log.NewFieldedLogger(&log.Fields{ +// "component": "hq/finishSender", +// }) + +// backoff := time.Second +// maxBackoff := 5 * time.Second + +// for { +// err := globalHQ.client.Delete(batch) +// select { +// case <-ctx.Done(): +// return +// default: +// if err != nil { +// logger.Error("Error sending batch to HQ", "err", err) +// time.Sleep(backoff) +// backoff *= 2 +// if backoff > maxBackoff { +// backoff = maxBackoff +// } +// continue +// } +// return +// } +// } +// } + +// // resetTimer safely resets the timer to the specified duration. +// func resetTimer(timer *time.Timer, duration time.Duration) { +// if !timer.Stop() { +// select { +// case <-timer.C: +// default: +// } +// } +// timer.Reset(duration) +// } + +// // getMaxFinishSenders returns the maximum number of sender routines based on configuration. +// func getMaxFinishSenders() int { +// workersCount := config.Get().WorkersCount +// if workersCount < 10 { +// return 1 +// } +// return workersCount / 10 +// } + +// // getBatchSize returns the batch size based on configuration. 
+// func getBatchSize() int { +// batchSize := config.Get().HQBatchSize +// if batchSize == 0 { +// batchSize = 100 // Default batch size. +// } +// return batchSize +// } diff --git a/internal/pkg/source/hq/producer.go b/internal/pkg/source/hq/producer.go new file mode 100644 index 00000000..4156b218 --- /dev/null +++ b/internal/pkg/source/hq/producer.go @@ -0,0 +1,106 @@ +package hq + +// import ( +// "math" +// "sync" +// "time" + +// "github.com/internetarchive/Zeno/internal/pkg/log" +// "github.com/internetarchive/gocrawlhq" +// ) + +// func producer() { +// var ( +// wg sync.WaitGroup +// logger = log.NewFieldedLogger(&log.Fields{ +// "component": "hq/producer", +// }) +// ) + +// // the discoveredArray is sent to the crawl HQ every 10 seconds +// // or when it reaches a certain size +// go func() { +// HQLastSent := time.Now() + +// for { +// select { +// case <-globalHQ.ctx.Done(): +// // no need to lock the mutex here, because the producer channel +// // is already closed, so no other goroutine can write to the slice +// if len(discoveredArray) > 0 { +// for { +// err := globalHQ.client.Add(discoveredArray, false) +// if err != nil { +// logger.Error("error sending payload to crawl HQ, waiting 1s then retrying..") +// time.Sleep(time.Second) +// continue +// } +// break +// } +// } + +// return +// default: +// mutex.Lock() +// if (len(discoveredArray) >= int(math.Ceil(float64(c.Workers.Count)/2)) || time.Since(HQLastSent) >= time.Second*10) && len(discoveredArray) > 0 { +// for { +// err := c.HQClient.Add(discoveredArray, false) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..") +// time.Sleep(time.Second) +// continue +// } +// break +// } + +// discoveredArray = []gocrawlhq.URL{} +// HQLastSent = time.Now() +// } +// mutex.Unlock() +// } +// } +// }() + +// // listen to the discovered channel and add the URLs to the discoveredArray +// for 
discoveredItem := range c.HQProducerChannel { +// var via string + +// if discoveredItem.ParentURL != nil { +// via = utils.URLToString(discoveredItem.ParentURL) +// } + +// discoveredURL := gocrawlhq.URL{ +// Value: utils.URLToString(discoveredItem.URL), +// Via: via, +// } + +// for i := uint64(0); i < discoveredItem.Hop; i++ { +// discoveredURL.Path += "L" +// } + +// // The reason we are using a string instead of a bool is because +// // gob's encode/decode doesn't properly support booleans +// if discoveredItem.BypassSeencheck { +// for { +// err := c.HQClient.Add([]gocrawlhq.URL{discoveredURL}, true) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ +// "bypassSeencheck": discoveredItem.BypassSeencheck, +// })).Error("error sending payload to crawl HQ, waiting 1s then retrying..") +// time.Sleep(time.Second) +// continue +// } +// break +// } +// continue +// } + +// mutex.Lock() +// discoveredArray = append(discoveredArray, discoveredURL) +// mutex.Unlock() +// } + +// // if we are here, it means that the HQProducerChannel has been closed +// // so we need to send the last payload to the crawl HQ +// terminateProducer <- true +// } diff --git a/internal/pkg/source/hq/utils.go b/internal/pkg/source/hq/utils.go index 7a1d7a1b..fed48e30 100644 --- a/internal/pkg/source/hq/utils.go +++ b/internal/pkg/source/hq/utils.go @@ -10,3 +10,12 @@ func pathToHops(path string) (hops int) { return hops } + +func hopsToPath(hops int) (path string) { + // For each hop, add an L to the path + for i := 0; i < hops; i++ { + path += "L" + } + + return path +} diff --git a/main.go b/main.go index 1be6afc1..890b16ef 100644 --- a/main.go +++ b/main.go @@ -55,9 +55,9 @@ func main() { seeds := 10000 mockItems := make([]*models.Item, 10000) for i := 0; i < seeds; i++ { - uuid := uuid.New() + uuid := uuid.New().String() mockItems[i] = &models.Item{ - UUID: &uuid, + ID: uuid, URL: &models.URL{Raw: fmt.Sprintf("https://www.deezer.com/%d", i)}, 
Status: models.ItemFresh, Source: models.ItemSourceHQ, diff --git a/pkg/models/item.go b/pkg/models/item.go index f906d87e..e1eccf1e 100644 --- a/pkg/models/item.go +++ b/pkg/models/item.go @@ -6,21 +6,22 @@ import ( // Item represents a URL, it's childs (e.g. discovered assets) and it's state in the pipeline type Item struct { - UUID *uuid.UUID // UUID is the unique identifier of the item + ID string // ID is the unique identifier of the item URL *URL // URL is a struct that contains the URL, the parsed URL, and its hop Status ItemState // Status is the state of the item in the pipeline Source ItemSource // Source is the source of the item in the pipeline ChildsCaptured bool // ChildsCaptured is the flag to indicate if the child URLs of the item have been captured Redirection *URL // Redirection is the URL that the item has been redirected to, if it's not nil it need to be captured + Via string // Via is the URL that the item has been found from Childs []*URL // Childs is the list of URLs that have been discovered via the item's URL Error error // Error message of the seed } func NewItem(source ItemSource) (item *Item) { - UUID := uuid.New() + UUID := uuid.New().String() item = &Item{ - UUID: &UUID, + ID: UUID, Status: ItemFresh, Source: source, } @@ -36,8 +37,8 @@ func (i *Item) GetChilds() []*URL { return i.Childs } -func (i *Item) GetUUID() *uuid.UUID { - return i.UUID +func (i *Item) GetID() string { + return i.ID } func (i *Item) GetURL() *URL { From 42bd8b20af62ee8c2381fc0672ab7f55fd015fd2 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Wed, 20 Nov 2024 13:03:09 +0100 Subject: [PATCH 054/295] add: embed resp redirection in URL struct --- cmd/get.go | 1 + cmd/get_url.go | 23 +++++++++++ internal/pkg/archiver/archiver.go | 24 +++++++++-- internal/pkg/config/config.go | 1 - internal/pkg/postprocessor/postprocessor.go | 7 ++-- internal/pkg/preprocessor/preprocessor.go | 17 +++++++- main.go | 45 +++++++++++---------- pkg/models/item.go | 2 - 8 files 
changed, 88 insertions(+), 32 deletions(-) create mode 100644 cmd/get_url.go diff --git a/cmd/get.go b/cmd/get.go index 5f17726e..850ecc2b 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -18,6 +18,7 @@ func getCMDs() *cobra.Command { getCMDsFlags(getCmd) getHQCmdFlags(getHQCmd) + getCmd.AddCommand(getURLCmd) getCmd.AddCommand(getHQCmd) return getCmd diff --git a/cmd/get_url.go b/cmd/get_url.go new file mode 100644 index 00000000..2f74f769 --- /dev/null +++ b/cmd/get_url.go @@ -0,0 +1,23 @@ +package cmd + +import ( + "fmt" + + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/spf13/cobra" +) + +var getURLCmd = &cobra.Command{ + Use: "url [URL...]", + Short: "Archive given URLs", + Args: cobra.MinimumNArgs(1), + PreRunE: func(cmd *cobra.Command, args []string) error { + if cfg == nil { + return fmt.Errorf("viper config is nil") + } + return nil + }, + RunE: func(cmd *cobra.Command, args []string) error { + return config.GenerateCrawlConfig() + }, +} diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index 1b1c30fe..3760f2b1 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -116,14 +116,19 @@ func archive(item *models.Item) { URLsToCapture []*models.URL guard = make(chan struct{}, config.Get().MaxConcurrentAssets) wg sync.WaitGroup + itemState = models.ItemCaptured ) // Determines the URLs that need to be captured, if the item's status is fresh we need - // to capture the seed, else we need to capture the child URLs (assets), in parallel - if item.Status == models.ItemFresh { + // to capture the seed, else if it's a redirection we need to captue it, and + // else we need to capture the child URLs (assets), in parallel + if item.GetStatus() == models.ItemPreProcessed { URLsToCapture = append(URLsToCapture, item.GetURL()) } else if item.GetRedirection() != nil { URLsToCapture = append(URLsToCapture, item.GetRedirection()) + // We want to nil the redirection field when the capture of 
the redirection is done, we + // will eventually fill it back in postprocess if this capture leads to another redirection + defer item.SetRedirection(nil) } else { URLsToCapture = item.GetChilds() } @@ -131,7 +136,7 @@ func archive(item *models.Item) { for _, URL := range URLsToCapture { guard <- struct{}{} wg.Add(1) - go func() { + go func(URL *models.URL) { defer wg.Done() defer func() { <-guard }() @@ -147,9 +152,18 @@ func archive(item *models.Item) { } if err != nil { logger.Error("unable to execute request", "err", err.Error(), "func", "archiver.archive") + + // Only mark the item as failed if we were processing a redirection or a new seed + if item.GetStatus() == models.ItemFresh || item.GetRedirection() != nil { + itemState = models.ItemFailed + } + return } + // Set the response in the item + URL.SetResponse(resp) + if resp.StatusCode != 200 { logger.Warn("non-200 status code", "status_code", resp.StatusCode) } @@ -159,8 +173,10 @@ func archive(item *models.Item) { if err != nil { logger.Error("unable to consume response body", "url", URL.String(), "err", err.Error(), "func", "archiver.archive") } - }() + }(URL) } wg.Wait() + + item.SetStatus(itemState) } diff --git a/internal/pkg/config/config.go b/internal/pkg/config/config.go index 464c2f13..8867bdd1 100644 --- a/internal/pkg/config/config.go +++ b/internal/pkg/config/config.go @@ -196,7 +196,6 @@ func GenerateCrawlConfig() error { } config.JobPath = path.Join("jobs", config.Job) - config.UseSeencheck = !config.DisableSeencheck // Defaults --max-crawl-time-limit to 10% more than --crawl-time-limit diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index bda9764f..4a5e60cb 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -101,21 +101,22 @@ func run() { } func postprocess(item *models.Item) { + defer item.SetStatus(models.ItemPostProcessed) + // Verify if there is any redirection if 
isStatusCodeRedirect(item.URL.GetResponse().StatusCode) { // Check if the current redirections count doesn't exceed the max allowed if item.URL.GetRedirects() >= config.Get().MaxRedirect { logger.Warn("max redirects reached", "item", item.ID) - item.Status = models.ItemCanceled return } // Prepare the new item resulting from the redirection - item.Redirection = &models.URL{ + item.SetRedirection(&models.URL{ Raw: item.URL.GetResponse().Header.Get("Location"), Redirects: item.URL.GetRedirects() + 1, Hops: item.URL.GetHops(), - } + }) return } diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index ca7edd31..cf979748 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -104,6 +104,8 @@ func run() { } func preprocess(item *models.Item) { + defer item.SetStatus(models.ItemPreProcessed) + // Validate the URL of either the item itself and/or its childs // TODO: if an error happen and it's a fresh item, we should mark it as failed in HQ (if it's a HQ-based crawl) @@ -114,7 +116,7 @@ func preprocess(item *models.Item) { ) // Validate the URLs, either the item's URL or its childs if it has any - if item.Status == models.ItemFresh { + if item.GetStatus() == models.ItemFresh { URLType = "seed" // Validate the item's URL itself @@ -124,6 +126,19 @@ func preprocess(item *models.Item) { return } + if config.Get().UseSeencheck { + URLsToSeencheck = append(URLsToSeencheck, item.URL) + } + } else if item.GetRedirection() != nil { + URLType = "seed" + + // Validate the item's URL itself + err = normalizeURL(item.GetURL(), nil) + if err != nil { + logger.Warn("unable to validate URL", "url", item.URL.Raw, "err", err.Error(), "func", "preprocessor.preprocessor") + return + } + if config.Get().UseSeencheck { URLsToSeencheck = append(URLsToSeencheck, item.URL) } diff --git a/main.go b/main.go index 890b16ef..f304fcbd 100644 --- a/main.go +++ b/main.go @@ -9,17 +9,17 @@ package main 
import ( - "fmt" - "os" "time" "github.com/google/uuid" "github.com/internetarchive/Zeno/cmd" "github.com/internetarchive/Zeno/internal/pkg/archiver" + "github.com/internetarchive/Zeno/internal/pkg/config" "github.com/internetarchive/Zeno/internal/pkg/finisher" "github.com/internetarchive/Zeno/internal/pkg/log" "github.com/internetarchive/Zeno/internal/pkg/postprocessor" "github.com/internetarchive/Zeno/internal/pkg/preprocessor" + "github.com/internetarchive/Zeno/internal/pkg/preprocessor/seencheck" "github.com/internetarchive/Zeno/internal/pkg/reactor" "github.com/internetarchive/Zeno/pkg/models" ) @@ -31,37 +31,40 @@ var ( func main() { log.Start() logger = log.NewFieldedLogger(&log.Fields{ - "component": "preprocessor", + "component": "main", }) defer log.Stop() if err := cmd.Run(); err != nil { - fmt.Println(err) - os.Exit(1) + logger.Error("unable to run root command", "err", err.Error()) + return + } + + // If needed, start the seencheck process + if config.Get().UseSeencheck { + err := seencheck.Start(config.Get().JobPath) + if err != nil { + logger.Error("unable to start seencheck", "err", err.Error()) + return + } } seedErrorChan := make(chan *models.Item) // Start the reactor that will receive reactorOutputChan := make(chan *models.Item) - // err := reactor.Start(config.Get().WorkersCount, reactorOutputChan) - err := reactor.Start(300, reactorOutputChan) - if err != nil { - logger.Error("error starting reactor", "err", err.Error()) - return - } + err := reactor.Start(config.Get().WorkersCount, reactorOutputChan) // Create mock seeds - seeds := 10000 - mockItems := make([]*models.Item, 10000) - for i := 0; i < seeds; i++ { - uuid := uuid.New().String() - mockItems[i] = &models.Item{ - ID: uuid, - URL: &models.URL{Raw: fmt.Sprintf("https://www.deezer.com/%d", i)}, - Status: models.ItemFresh, - Source: models.ItemSourceHQ, - } + mockItems := make([]*models.Item, 1) + URL := "http://www.youtube.com/watch?v=stUqfrc1EFE" + UUID := uuid.New() + + mockItems[0] = 
&models.Item{ + ID: UUID.String(), + URL: &models.URL{Raw: URL}, + Status: models.ItemFresh, + Source: models.ItemSourceHQ, } preprocessorOutputChan := make(chan *models.Item) diff --git a/pkg/models/item.go b/pkg/models/item.go index e1eccf1e..66cd493d 100644 --- a/pkg/models/item.go +++ b/pkg/models/item.go @@ -101,8 +101,6 @@ const ( ItemCaptured // ItemPostProcessed is the state after the item has been post-processed ItemPostProcessed - // ItemCanceled is the state after the item has been canceled, e.g. cancellation can come from exceeding the redirects limit - ItemCanceled // ItemFailed is the state after the item has failed ItemFailed // ItemCompleted is the state after the item has been completed From c6a814ec43f5643ca3f6d0a2695e2709e08b5c02 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Wed, 20 Nov 2024 13:22:49 +0100 Subject: [PATCH 055/295] hq: add finisher --- internal/pkg/source/hq/finisher.go | 343 +++++++++++++++-------------- 1 file changed, 176 insertions(+), 167 deletions(-) diff --git a/internal/pkg/source/hq/finisher.go b/internal/pkg/source/hq/finisher.go index 80501f9a..0c03e942 100644 --- a/internal/pkg/source/hq/finisher.go +++ b/internal/pkg/source/hq/finisher.go @@ -1,169 +1,178 @@ package hq -// import ( -// "context" -// "sync" -// "time" - -// "github.com/internetarchive/Zeno/internal/pkg/config" -// "github.com/internetarchive/Zeno/internal/pkg/log" -// "github.com/internetarchive/gocrawlhq" -// ) - -// var ( -// // batchCh is a buffered channel that holds batches ready to be sent to HQ. -// // Its capacity is set to the maximum number of sender routines. -// batchCh chan []*gocrawlhq.URL -// ) - -// // finisher initializes and starts the finisher and dispatcher processes. 
-// func finisher() { -// var wg sync.WaitGroup - -// maxSenders := getMaxFinishSenders() -// batchCh = make(chan []*gocrawlhq.URL, maxSenders) - -// wg.Add(1) -// go receiver(ctx, &wg) - -// wg.Add(1) -// go dispatcher(ctx, &wg) - -// // Wait for the context to be canceled. -// <-ctx.Done() - -// // Wait for the finisher and dispatcher to finish. -// wg.Wait() -// } - -// // finishReceiver reads URLs from finishCh, accumulates them into batches, and sends the batches to batchCh. -// func finishReceiver(ctx context.Context, wg *sync.WaitGroup) { -// defer wg.Done() - -// logger := log.NewFieldedLogger(&log.Fields{ -// "component": "hq/finishReceiver", -// }) - -// batchSize := getBatchSize() -// maxWaitTime := 5 * time.Second - -// batch := make([]*gocrawlhq.URL, 0, batchSize) -// timer := time.NewTimer(maxWaitTime) -// defer timer.Stop() - -// for { -// select { -// case <-ctx.Done(): -// // Send any remaining URLs. -// if len(batch) > 0 { -// batchCh <- batch // Blocks if batchCh is full. -// } -// return -// case url := <-globalHQ.finishCh: -// URLToSend := &gocrawlhq.URL{ -// ID: url.ID, -// } -// batch = append(batch, &URLToSend) -// if len(batch) >= batchSize { -// // Send the batch to batchCh. -// batchCh <- batch // Blocks if batchCh is full. -// batch = make([]gocrawlhq.URL, 0, batchSize) -// resetTimer(timer, maxWaitTime) -// } -// case <-timer.C: -// if len(batch) > 0 { -// batchCh <- batch // Blocks if batchCh is full. -// batch = make([]gocrawlhq.URL, 0, batchSize) -// } -// resetTimer(timer, maxWaitTime) -// } -// } -// } - -// // finishDispatcher receives batches from batchCh and dispatches them to sender routines. 
-// func finishDispatcher(ctx context.Context, wg *sync.WaitGroup) { -// defer wg.Done() - -// logger := log.NewFieldedLogger(&log.Fields{ -// "component": "hq/dispatcher", -// }) - -// maxSenders := getMaxFinishSenders() -// senderSemaphore := make(chan struct{}, maxSenders) -// var senderWg sync.WaitGroup - -// for { -// select { -// case batch := <-batchCh: -// senderSemaphore <- struct{}{} // Blocks if maxSenders reached. -// senderWg.Add(1) -// go func(batch []gocrawlhq.URL) { -// defer senderWg.Done() -// defer func() { <-senderSemaphore }() -// finishSender(ctx, batch) -// }(batch) -// case <-ctx.Done(): -// // Wait for all sender routines to finish. -// senderWg.Wait() -// return -// } -// } -// } - -// // finishSender sends a batch of URLs to HQ with retries and exponential backoff. -// func finishSender(ctx context.Context, batch []gocrawlhq.URL) { -// logger := log.NewFieldedLogger(&log.Fields{ -// "component": "hq/finishSender", -// }) - -// backoff := time.Second -// maxBackoff := 5 * time.Second - -// for { -// err := globalHQ.client.Delete(batch) -// select { -// case <-ctx.Done(): -// return -// default: -// if err != nil { -// logger.Error("Error sending batch to HQ", "err", err) -// time.Sleep(backoff) -// backoff *= 2 -// if backoff > maxBackoff { -// backoff = maxBackoff -// } -// continue -// } -// return -// } -// } -// } - -// // resetTimer safely resets the timer to the specified duration. -// func resetTimer(timer *time.Timer, duration time.Duration) { -// if !timer.Stop() { -// select { -// case <-timer.C: -// default: -// } -// } -// timer.Reset(duration) -// } - -// // getMaxFinishSenders returns the maximum number of sender routines based on configuration. -// func getMaxFinishSenders() int { -// workersCount := config.Get().WorkersCount -// if workersCount < 10 { -// return 1 -// } -// return workersCount / 10 -// } - -// // getBatchSize returns the batch size based on configuration. 
-// func getBatchSize() int { -// batchSize := config.Get().HQBatchSize -// if batchSize == 0 { -// batchSize = 100 // Default batch size. -// } -// return batchSize -// } +import ( + "context" + "sync" + "time" + + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/gocrawlhq" +) + +type finishBatch struct { + URLs []gocrawlhq.URL + ChildsCaptured int +} + +// finisher initializes and starts the finisher and dispatcher processes. +func finisher() { + // Create a context to manage goroutines + ctx, cancel := context.WithCancel(globalHQ.ctx) + defer cancel() + + maxSenders := getMaxFinishSenders() + batchCh := make(chan *finishBatch, maxSenders) + + var wg sync.WaitGroup + + wg.Add(1) + go finishReceiver(ctx, &wg, batchCh) + + wg.Add(1) + go finishDispatcher(ctx, &wg, batchCh) + + // Wait for the context to be canceled. + <-ctx.Done() + + // Wait for the finisher and dispatcher to finish. + wg.Wait() +} + +// finishReceiver reads URLs from finishCh, accumulates them into batches, and sends the batches to batchCh. +func finishReceiver(ctx context.Context, wg *sync.WaitGroup, batchCh chan *finishBatch) { + defer wg.Done() + + logger := log.NewFieldedLogger(&log.Fields{ + "component": "hq.finishReceiver", + }) + + batchSize := getBatchSize() + maxWaitTime := 5 * time.Second + + batch := &finishBatch{ + URLs: make([]gocrawlhq.URL, 0, batchSize), + } + timer := time.NewTimer(maxWaitTime) + defer timer.Stop() + + for { + select { + case <-ctx.Done(): + // Send any remaining URLs. + if len(batch.URLs) > 0 { + logger.Debug("while closing sending remaining batch to dispatcher", "size", len(batch.URLs)) + batchCh <- batch // Blocks if batchCh is full. 
+ } + return + case url := <-globalHQ.finishCh: + URLToSend := gocrawlhq.URL{ + ID: url.ID, + } + batch.URLs = append(batch.URLs, URLToSend) + if len(batch.URLs) >= batchSize { + logger.Debug("sending batch to dispatcher", "size", len(batch.URLs)) + // Send the batch to batchCh. + batchCh <- batch // Blocks if batchCh is full. + batch.URLs = make([]gocrawlhq.URL, 0, batchSize) + resetTimer(timer, maxWaitTime) + } + case <-timer.C: + if len(batch.URLs) > 0 { + logger.Debug("sending non-full batch to dispatcher", "size", len(batch.URLs)) + batchCh <- batch // Blocks if batchCh is full. + batch.URLs = make([]gocrawlhq.URL, 0, batchSize) + } + resetTimer(timer, maxWaitTime) + } + } +} + +// finishDispatcher receives batches from batchCh and dispatches them to sender routines. +func finishDispatcher(ctx context.Context, wg *sync.WaitGroup, batchCh chan *finishBatch) { + defer wg.Done() + + logger := log.NewFieldedLogger(&log.Fields{ + "component": "hq.finishDispatcher", + }) + + maxSenders := getMaxFinishSenders() + senderSemaphore := make(chan struct{}, maxSenders) + var senderWg sync.WaitGroup + + for { + select { + case batch := <-batchCh: + senderSemaphore <- struct{}{} // Blocks if maxSenders reached. + senderWg.Add(1) + logger.Debug("dispatching batch to sender", "size", len(batch.URLs)) + go func(batch *finishBatch) { + defer senderWg.Done() + defer func() { <-senderSemaphore }() + finishSender(ctx, batch) + }(batch) + case <-ctx.Done(): + // Wait for all sender routines to finish. + senderWg.Wait() + return + } + } +} + +// finishSender sends a batch of URLs to HQ with retries and exponential backoff. 
+func finishSender(ctx context.Context, batch *finishBatch) { + logger := log.NewFieldedLogger(&log.Fields{ + "component": "hq.finishSender", + }) + + backoff := time.Second + maxBackoff := 5 * time.Second + + for { + err := globalHQ.client.Delete(batch.URLs, batch.ChildsCaptured) + select { + case <-ctx.Done(): + return + default: + if err != nil { + logger.Error("Error sending batch to HQ", "err", err) + time.Sleep(backoff) + backoff *= 2 + if backoff > maxBackoff { + backoff = maxBackoff + } + continue + } + return + } + } +} + +// resetTimer safely resets the timer to the specified duration. +func resetTimer(timer *time.Timer, duration time.Duration) { + if !timer.Stop() { + select { + case <-timer.C: + default: + } + } + timer.Reset(duration) +} + +// getMaxFinishSenders returns the maximum number of sender routines based on configuration. +func getMaxFinishSenders() int { + workersCount := config.Get().WorkersCount + if workersCount < 10 { + return 1 + } + return workersCount / 10 +} + +// getBatchSize returns the batch size based on configuration. +func getBatchSize() int { + batchSize := config.Get().WorkersCount + if batchSize == 0 { + batchSize = 100 // Default batch size. + } + return batchSize +} From 47cbc24403dfff960d6be7297aa0e6c5c9975894 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Wed, 20 Nov 2024 13:30:00 +0100 Subject: [PATCH 056/295] amend last commit: forgot to cancel sub-routines context --- internal/pkg/source/hq/finisher.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/internal/pkg/source/hq/finisher.go b/internal/pkg/source/hq/finisher.go index 0c03e942..5e472d3e 100644 --- a/internal/pkg/source/hq/finisher.go +++ b/internal/pkg/source/hq/finisher.go @@ -35,6 +35,9 @@ func finisher() { // Wait for the context to be canceled. <-ctx.Done() + // Cancel the context to stop all goroutines. + cancel() + // Wait for the finisher and dispatcher to finish. 
wg.Wait() } From 07805363f9047b168d7d92fad39ea24c30b431cf Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Wed, 20 Nov 2024 14:12:55 +0100 Subject: [PATCH 057/295] hq: producer implemented and other fixes --- internal/pkg/source/hq/finisher.go | 21 +-- internal/pkg/source/hq/hq.go | 26 +-- internal/pkg/source/hq/producer.go | 274 ++++++++++++++++++----------- internal/pkg/source/hq/utils.go | 13 ++ 4 files changed, 201 insertions(+), 133 deletions(-) diff --git a/internal/pkg/source/hq/finisher.go b/internal/pkg/source/hq/finisher.go index 5e472d3e..06d8419b 100644 --- a/internal/pkg/source/hq/finisher.go +++ b/internal/pkg/source/hq/finisher.go @@ -68,11 +68,11 @@ func finishReceiver(ctx context.Context, wg *sync.WaitGroup, batchCh chan *finis batchCh <- batch // Blocks if batchCh is full. } return - case url := <-globalHQ.finishCh: - URLToSend := gocrawlhq.URL{ - ID: url.ID, + case item := <-globalHQ.finishCh: + URL := gocrawlhq.URL{ + ID: item.ID, } - batch.URLs = append(batch.URLs, URLToSend) + batch.URLs = append(batch.URLs, URL) if len(batch.URLs) >= batchSize { logger.Debug("sending batch to dispatcher", "size", len(batch.URLs)) // Send the batch to batchCh. @@ -138,7 +138,7 @@ func finishSender(ctx context.Context, batch *finishBatch) { return default: if err != nil { - logger.Error("Error sending batch to HQ", "err", err) + logger.Error("error sending batch to HQ", "err", err) time.Sleep(backoff) backoff *= 2 if backoff > maxBackoff { @@ -151,17 +151,6 @@ func finishSender(ctx context.Context, batch *finishBatch) { } } -// resetTimer safely resets the timer to the specified duration. -func resetTimer(timer *time.Timer, duration time.Duration) { - if !timer.Stop() { - select { - case <-timer.C: - default: - } - } - timer.Reset(duration) -} - // getMaxFinishSenders returns the maximum number of sender routines based on configuration. 
func getMaxFinishSenders() int { workersCount := config.Get().WorkersCount diff --git a/internal/pkg/source/hq/hq.go b/internal/pkg/source/hq/hq.go index 9f28dcdb..e7962ace 100644 --- a/internal/pkg/source/hq/hq.go +++ b/internal/pkg/source/hq/hq.go @@ -13,12 +13,12 @@ import ( ) type hq struct { - wg sync.WaitGroup - ctx context.Context - cancel context.CancelFunc - finishCh chan *models.Item - producedCh chan *models.Item - client *gocrawlhq.Client + wg sync.WaitGroup + ctx context.Context + cancel context.CancelFunc + finishCh chan *models.Item + produceCh chan *models.Item + client *gocrawlhq.Client } var ( @@ -27,7 +27,7 @@ var ( logger *log.FieldedLogger ) -func Start(finishChan, producedChan chan *models.Item) error { +func Start(finishChan, produceChan chan *models.Item) error { var done bool log.Start() @@ -48,12 +48,12 @@ func Start(finishChan, producedChan chan *models.Item) error { } globalHQ = &hq{ - wg: sync.WaitGroup{}, - ctx: ctx, - cancel: cancel, - finishCh: finishChan, - producedCh: producedChan, - client: HQclient, + wg: sync.WaitGroup{}, + ctx: ctx, + cancel: cancel, + finishCh: finishChan, + produceCh: produceChan, + client: HQclient, } globalHQ.wg.Add(1) diff --git a/internal/pkg/source/hq/producer.go b/internal/pkg/source/hq/producer.go index 4156b218..bb3773cc 100644 --- a/internal/pkg/source/hq/producer.go +++ b/internal/pkg/source/hq/producer.go @@ -1,106 +1,172 @@ package hq -// import ( -// "math" -// "sync" -// "time" - -// "github.com/internetarchive/Zeno/internal/pkg/log" -// "github.com/internetarchive/gocrawlhq" -// ) - -// func producer() { -// var ( -// wg sync.WaitGroup -// logger = log.NewFieldedLogger(&log.Fields{ -// "component": "hq/producer", -// }) -// ) - -// // the discoveredArray is sent to the crawl HQ every 10 seconds -// // or when it reaches a certain size -// go func() { -// HQLastSent := time.Now() - -// for { -// select { -// case <-globalHQ.ctx.Done(): -// // no need to lock the mutex here, because the producer 
channel -// // is already closed, so no other goroutine can write to the slice -// if len(discoveredArray) > 0 { -// for { -// err := globalHQ.client.Add(discoveredArray, false) -// if err != nil { -// logger.Error("error sending payload to crawl HQ, waiting 1s then retrying..") -// time.Sleep(time.Second) -// continue -// } -// break -// } -// } - -// return -// default: -// mutex.Lock() -// if (len(discoveredArray) >= int(math.Ceil(float64(c.Workers.Count)/2)) || time.Since(HQLastSent) >= time.Second*10) && len(discoveredArray) > 0 { -// for { -// err := c.HQClient.Add(discoveredArray, false) -// if err != nil { -// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..") -// time.Sleep(time.Second) -// continue -// } -// break -// } - -// discoveredArray = []gocrawlhq.URL{} -// HQLastSent = time.Now() -// } -// mutex.Unlock() -// } -// } -// }() - -// // listen to the discovered channel and add the URLs to the discoveredArray -// for discoveredItem := range c.HQProducerChannel { -// var via string - -// if discoveredItem.ParentURL != nil { -// via = utils.URLToString(discoveredItem.ParentURL) -// } - -// discoveredURL := gocrawlhq.URL{ -// Value: utils.URLToString(discoveredItem.URL), -// Via: via, -// } - -// for i := uint64(0); i < discoveredItem.Hop; i++ { -// discoveredURL.Path += "L" -// } - -// // The reason we are using a string instead of a bool is because -// // gob's encode/decode doesn't properly support booleans -// if discoveredItem.BypassSeencheck { -// for { -// err := c.HQClient.Add([]gocrawlhq.URL{discoveredURL}, true) -// if err != nil { -// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ -// "bypassSeencheck": discoveredItem.BypassSeencheck, -// })).Error("error sending payload to crawl HQ, waiting 1s then retrying..") -// time.Sleep(time.Second) -// continue -// } -// break -// } -// continue -// } - -// mutex.Lock() -// discoveredArray = 
append(discoveredArray, discoveredURL) -// mutex.Unlock() -// } - -// // if we are here, it means that the HQProducerChannel has been closed -// // so we need to send the last payload to the crawl HQ -// terminateProducer <- true -// } +import ( + "context" + "sync" + "time" + + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/gocrawlhq" +) + +// producerBatch represents a batch of URLs to be added to HQ. +type producerBatch struct { + URLs []gocrawlhq.URL +} + +// producer initializes and starts the producer and dispatcher processes. +func producer() { + // Create a context to manage goroutines + ctx, cancel := context.WithCancel(globalHQ.ctx) + defer cancel() + + maxSenders := getMaxProducerSenders() + batchCh := make(chan *producerBatch, maxSenders) + + var wg sync.WaitGroup + + wg.Add(1) + go producerReceiver(ctx, &wg, batchCh) + + wg.Add(1) + go producerDispatcher(ctx, &wg, batchCh) + + // Wait for the context to be canceled. + <-ctx.Done() + + // Cancel the context to stop all goroutines. + cancel() + + // Wait for the producer and dispatcher to finish. + wg.Wait() +} + +// producerReceiver reads URLs from produceCh, accumulates them into batches, and sends the batches to batchCh. +func producerReceiver(ctx context.Context, wg *sync.WaitGroup, batchCh chan *producerBatch) { + defer wg.Done() + + logger := log.NewFieldedLogger(&log.Fields{ + "component": "hq.producerReceiver", + }) + + batchSize := getProducerBatchSize() + maxWaitTime := 5 * time.Second + + batch := &producerBatch{ + URLs: make([]gocrawlhq.URL, 0, batchSize), + } + timer := time.NewTimer(maxWaitTime) + defer timer.Stop() + + for { + select { + case <-ctx.Done(): + // Send any remaining URLs. + if len(batch.URLs) > 0 { + logger.Debug("while closing, sending remaining batch to dispatcher", "size", len(batch.URLs)) + batchCh <- batch // Blocks if batchCh is full. 
+ } + return + case item := <-globalHQ.produceCh: + URL := gocrawlhq.URL{ + Value: item.URL.String(), + Via: item.Via, + Path: hopsToPath(item.URL.GetHops()), + } + batch.URLs = append(batch.URLs, URL) + if len(batch.URLs) >= batchSize { + logger.Debug("sending batch to dispatcher", "size", len(batch.URLs)) + // Send the batch to batchCh. + batchCh <- batch // Blocks if batchCh is full. + batch.URLs = make([]gocrawlhq.URL, 0, batchSize) + resetTimer(timer, maxWaitTime) + } + case <-timer.C: + if len(batch.URLs) > 0 { + logger.Debug("sending non-full batch to dispatcher", "size", len(batch.URLs)) + batchCh <- batch // Blocks if batchCh is full. + batch.URLs = make([]gocrawlhq.URL, 0, batchSize) + } + resetTimer(timer, maxWaitTime) + } + } +} + +// producerDispatcher receives batches from batchCh and dispatches them to sender routines. +func producerDispatcher(ctx context.Context, wg *sync.WaitGroup, batchCh chan *producerBatch) { + defer wg.Done() + + logger := log.NewFieldedLogger(&log.Fields{ + "component": "hq.producerDispatcher", + }) + + maxSenders := getMaxProducerSenders() + senderSemaphore := make(chan struct{}, maxSenders) + var senderWg sync.WaitGroup + + for { + select { + case batch := <-batchCh: + senderSemaphore <- struct{}{} // Blocks if maxSenders reached. + senderWg.Add(1) + logger.Debug("dispatching batch to sender", "size", len(batch.URLs)) + go func(batch *producerBatch) { + defer senderWg.Done() + defer func() { <-senderSemaphore }() + producerSender(ctx, batch) + }(batch) + case <-ctx.Done(): + // Wait for all sender routines to finish. + senderWg.Wait() + return + } + } +} + +// producerSender sends a batch of URLs to HQ with retries and exponential backoff. 
+func producerSender(ctx context.Context, batch *producerBatch) { + logger := log.NewFieldedLogger(&log.Fields{ + "component": "hq.producerSender", + }) + + backoff := time.Second + maxBackoff := 5 * time.Second + + for { + err := globalHQ.client.Add(batch.URLs, false) // Use bypassSeencheck = false + select { + case <-ctx.Done(): + return + default: + if err != nil { + logger.Error("error sending batch to HQ", "err", err) + time.Sleep(backoff) + backoff *= 2 + if backoff > maxBackoff { + backoff = maxBackoff + } + continue + } + return + } + } +} + +// getMaxProducerSenders returns the maximum number of sender routines based on configuration. +func getMaxProducerSenders() int { + workersCount := config.Get().WorkersCount + if workersCount < 10 { + return 1 + } + return workersCount / 10 +} + +// getProducerBatchSize returns the batch size based on configuration. +func getProducerBatchSize() int { + batchSize := config.Get().HQBatchSize + if batchSize == 0 { + batchSize = 100 // Default batch size. + } + return batchSize +} diff --git a/internal/pkg/source/hq/utils.go b/internal/pkg/source/hq/utils.go index fed48e30..474b92b6 100644 --- a/internal/pkg/source/hq/utils.go +++ b/internal/pkg/source/hq/utils.go @@ -1,5 +1,7 @@ package hq +import "time" + func pathToHops(path string) (hops int) { // For each L in the path, add 1 hop for _, c := range path { @@ -19,3 +21,14 @@ func hopsToPath(hops int) (path string) { return path } + +// resetTimer safely resets the timer to the specified duration. 
+func resetTimer(timer *time.Timer, duration time.Duration) { + if !timer.Stop() { + select { + case <-timer.C: + default: + } + } + timer.Reset(duration) +} From bd60c3bcffdf3dd2246b8ef35cf3e6b6a77353f2 Mon Sep 17 00:00:00 2001 From: Will Howes Date: Wed, 20 Nov 2024 14:29:42 +0100 Subject: [PATCH 058/295] write normalizeURL function --- go.mod | 1 + go.sum | 2 + internal/pkg/preprocessor/url.go | 35 +++++++++++++- internal/pkg/preprocessor/url_test.go | 66 +++++++++++++++++++++++++++ 4 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 internal/pkg/preprocessor/url_test.go diff --git a/go.mod b/go.mod index 8045fe1d..f873fe26 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.23.3 require ( github.com/CorentinB/warc v0.8.53 + github.com/ada-url/goada v0.0.0-20240402045241-5e45a5777313 github.com/elastic/go-elasticsearch v0.0.0 github.com/elastic/go-elasticsearch/v7 v7.17.10 github.com/google/uuid v1.6.0 diff --git a/go.sum b/go.sum index 7f815386..95aa41fe 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,7 @@ github.com/CorentinB/warc v0.8.53 h1:xVz3RMdZ6faAqTtLfcK1/yl8ZTansy+B2en//EZLUlM= github.com/CorentinB/warc v0.8.53/go.mod h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8= +github.com/ada-url/goada v0.0.0-20240402045241-5e45a5777313 h1:jdPBTZ3nZwBBZzz5SCFUMcTxoZr8t9ogwdvD3P27f/E= +github.com/ada-url/goada v0.0.0-20240402045241-5e45a5777313/go.mod h1:+D/veNwI2mA1hDYLVrYSobYcLFWm6e3DJ/H/d/dxlu8= github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= diff --git a/internal/pkg/preprocessor/url.go b/internal/pkg/preprocessor/url.go index 84be7696..0f22face 100644 --- a/internal/pkg/preprocessor/url.go +++ b/internal/pkg/preprocessor/url.go @@ -1,8 +1,39 @@ package preprocessor -import 
"github.com/internetarchive/Zeno/pkg/models" +import ( + "fmt" + "net/url" + + "github.com/ada-url/goada" + "github.com/internetarchive/Zeno/pkg/models" +) func normalizeURL(URL *models.URL, parentURL *models.URL) (err error) { - // Validate the URL, REMOVE FRAGMENTS, try to fix it, make it absolute if needed, etc. + // Normalize the URL by removing fragments, attempting to add URL scheme if missing, + // and converting relative URLs into absolute URLs. An error is returned if the URL + // cannot be normalized. + + var ada_result *goada.Url + if parentURL == nil { + parsedURL, err := url.Parse(URL.Raw) + if err != nil { + return err + } + fmt.Println(parsedURL.Scheme) + if parsedURL.Scheme == "" { + parsedURL.Scheme = "http" + } + ada_result, err = goada.New(parsedURL.String()) + if err != nil { + return err + } + } else { + ada_result, err = goada.NewWithBase(URL.Raw, parentURL.Raw) + if err != nil { + return err + } + } + ada_result.SetHash("") + URL.Raw = ada_result.Href() return URL.Parse() } diff --git a/internal/pkg/preprocessor/url_test.go b/internal/pkg/preprocessor/url_test.go new file mode 100644 index 00000000..991fa8bc --- /dev/null +++ b/internal/pkg/preprocessor/url_test.go @@ -0,0 +1,66 @@ +package preprocessor + +import ( + "testing" + + "github.com/internetarchive/Zeno/pkg/models" +) + +func TestNormalizeURL(t *testing.T) { + tests := []struct { + name string + rawURL string + parentURL string + wantErr bool + expectedURL string + }{ + { + name: "valid absolute URL", + rawURL: "https://example.com/path", + wantErr: false, + expectedURL: "https://example.com/path", + }, + { + name: "valid relative URL with parent", + rawURL: "/path", + parentURL: "https://example.com", + wantErr: false, + expectedURL: "https://example.com/path", + }, + { + name: "invalid URL", + rawURL: "://invalid-url", + wantErr: true, + }, + { + name: "valid URL without scheme", + rawURL: "www.google.com", + wantErr: false, + expectedURL: "http://www.google.com/", + }, + { + 
name: "valid URL with path without scheme", + rawURL: "www.google.com/dogs", + wantErr: false, + expectedURL: "http://www.google.com/dogs", + }, + } + + for _, tt := range tests { + // TODO: add support for nil value of parentURL + t.Run(tt.name, func(t *testing.T) { + url := &models.URL{Raw: tt.rawURL} + var parentURL *models.URL + if tt.parentURL != "" { + parentURL = &models.URL{Raw: tt.parentURL} + } + err := normalizeURL(url, parentURL) + if (err != nil) != tt.wantErr { + t.Errorf("normalizeURL() error = %v, wantErr %v", err, tt.wantErr) + } + if !tt.wantErr && url.Raw != tt.expectedURL { + t.Errorf("normalizeURL() got = %v, want %v", url.Raw, tt.expectedURL) + } + }) + } +} From 6e84df754c53f75939f536275343e682dd3a0d29 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Wed, 20 Nov 2024 14:14:52 +0100 Subject: [PATCH 059/295] hq: reset the once when stopped --- internal/pkg/source/hq/hq.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/internal/pkg/source/hq/hq.go b/internal/pkg/source/hq/hq.go index e7962ace..23155e90 100644 --- a/internal/pkg/source/hq/hq.go +++ b/internal/pkg/source/hq/hq.go @@ -56,10 +56,10 @@ func Start(finishChan, produceChan chan *models.Item) error { client: HQclient, } - globalHQ.wg.Add(1) + globalHQ.wg.Add(3) go consumer() - // go producer() - // go finisher() + go producer() + go finisher() done = true }) @@ -74,6 +74,7 @@ func Stop() { if globalHQ != nil { globalHQ.cancel() globalHQ.wg.Wait() + once = sync.Once{} logger.Info("stopped") } } From d48a301daaddfe843bcf1d9ef7c0b7ad02345852 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Wed, 20 Nov 2024 14:31:19 +0100 Subject: [PATCH 060/295] finisher: add conditions ; global: update main --- internal/pkg/finisher/finisher.go | 57 +++++++++++++++++++------- main.go | 66 ++++++++++++++----------------- 2 files changed, 71 insertions(+), 52 deletions(-) diff --git a/internal/pkg/finisher/finisher.go b/internal/pkg/finisher/finisher.go index 
a05bdff3..f16e7637 100644 --- a/internal/pkg/finisher/finisher.go +++ b/internal/pkg/finisher/finisher.go @@ -10,11 +10,13 @@ import ( ) type finisher struct { - ctx context.Context - cancel context.CancelFunc - inputCh chan *models.Item - errorCh chan *models.Item - wg sync.WaitGroup + ctx context.Context + cancel context.CancelFunc + inputCh chan *models.Item + errorCh chan *models.Item + sourceFinishedCh chan *models.Item + sourceProducedCh chan *models.Item + wg sync.WaitGroup } var ( @@ -25,7 +27,7 @@ var ( // Start initializes the global finisher with the given input channel. // This method can only be called once. -func Start(inputChan, errorChan chan *models.Item) error { +func Start(inputChan, errorChan, sourceFinishedChan, sourceProducedChan chan *models.Item) error { var done bool log.Start() @@ -36,10 +38,13 @@ func Start(inputChan, errorChan chan *models.Item) error { once.Do(func() { ctx, cancel := context.WithCancel(context.Background()) globalFinisher = &finisher{ - ctx: ctx, - cancel: cancel, - inputCh: inputChan, - errorCh: errorChan, + ctx: ctx, + cancel: cancel, + inputCh: inputChan, + errorCh: errorChan, + sourceFinishedCh: sourceFinishedChan, + sourceProducedCh: sourceProducedChan, + wg: sync.WaitGroup{}, } logger.Debug("initialized") globalFinisher.wg.Add(1) @@ -82,24 +87,46 @@ func (f *finisher) run() { logger.Debug("received item", "item", item.ID) if item.Error != nil { - logger.Error("received item with error", "item", item.ID, "error", item.Error) + logger.Error("received item with error", "item", item.ID, "err", item.Error) f.errorCh <- item continue } - reactor.MarkAsFinished(item) + if item.GetStatus() == models.ItemFresh { + logger.Debug("fresh item received", "item", item) + f.sourceProducedCh <- item + } else if item.GetRedirection() != nil { + logger.Debug("item has redirection", "item", item.ID) + err := reactor.ReceiveFeedback(item) + if err != nil { + panic(err) + } + } else if len(item.GetChilds()) != 0 { + logger.Debug("item 
has children", "item", item.ID) + err := reactor.ReceiveFeedback(item) + if err != nil { + panic(err) + } + } else { + logger.Debug("item has no redirection or children", "item", item.ID) + err := reactor.MarkAsFinished(item) + if err != nil { + panic(err) + } + f.sourceFinishedCh <- item + } - logger.Info("item finished", "item", item.ID) + logger.Debug("item finished", "item", item.ID) case item := <-f.errorCh: if item == nil { panic("received nil item") } - logger.Info("received item with error", "item", item.ID, "error", item.Error) + logger.Info("received item with error", "item", item.ID, "err", item.Error) reactor.MarkAsFinished(item) - logger.Info("item with error finished", "item", item.ID) + logger.Debug("item with error finished", "item", item.ID) } } } diff --git a/main.go b/main.go index f304fcbd..ea00f82d 100644 --- a/main.go +++ b/main.go @@ -9,9 +9,10 @@ package main import ( - "time" + "os" + "os/signal" + "syscall" - "github.com/google/uuid" "github.com/internetarchive/Zeno/cmd" "github.com/internetarchive/Zeno/internal/pkg/archiver" "github.com/internetarchive/Zeno/internal/pkg/config" @@ -21,6 +22,7 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/preprocessor" "github.com/internetarchive/Zeno/internal/pkg/preprocessor/seencheck" "github.com/internetarchive/Zeno/internal/pkg/reactor" + "github.com/internetarchive/Zeno/internal/pkg/source/hq" "github.com/internetarchive/Zeno/pkg/models" ) @@ -55,18 +57,6 @@ func main() { reactorOutputChan := make(chan *models.Item) err := reactor.Start(config.Get().WorkersCount, reactorOutputChan) - // Create mock seeds - mockItems := make([]*models.Item, 1) - URL := "http://www.youtube.com/watch?v=stUqfrc1EFE" - UUID := uuid.New() - - mockItems[0] = &models.Item{ - ID: UUID.String(), - URL: &models.URL{Raw: URL}, - Status: models.ItemFresh, - Source: models.ItemSourceHQ, - } - preprocessorOutputChan := make(chan *models.Item) err = preprocessor.Start(reactorOutputChan, preprocessorOutputChan, 
seedErrorChan) if err != nil { @@ -88,34 +78,36 @@ func main() { return } - err = finisher.Start(postprocessorOutputChan, seedErrorChan) + hqFinishChan := make(chan *models.Item) + hqProduceChan := make(chan *models.Item) + err = hq.Start(hqFinishChan, hqProduceChan) if err != nil { - logger.Error("error starting finisher", "err", err.Error()) + logger.Error("error starting hq", "err", err.Error()) return } - // Queue mock seeds to the source channel - for _, seed := range mockItems { - err := reactor.ReceiveInsert(seed) - if err != nil { - logger.Error("Error queuing seed to source channel", "err", err.Error()) - return - } + err = finisher.Start(postprocessorOutputChan, seedErrorChan, hqFinishChan, hqProduceChan) + if err != nil { + logger.Error("error starting finisher", "err", err.Error()) + return } - for { - time.Sleep(1 * time.Second) - if len(reactor.GetStateTable()) == 0 { - for archiver.GetWARCWritingQueueSize() != 0 { - logger.Info("waiting for WARC client(s) to finish writing to disk", "queue_size", archiver.GetWARCWritingQueueSize()) - } - - finisher.Stop() - postprocessor.Stop() - archiver.Stop() - preprocessor.Stop() - reactor.Stop() - return - } + // Handle OS signals for graceful shutdown + signalChan := make(chan os.Signal, 1) + signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM) + + select { + case <-signalChan: + logger.Info("received shutdown signal, stopping services...") + case item := <-seedErrorChan: + logger.Error("received error from seedErrorChan", "err", item.GetError()) } + + finisher.Stop() + hq.Stop() + postprocessor.Stop() + archiver.Stop() + preprocessor.Stop() + reactor.Stop() + logger.Info("all services stopped, exiting") } From a9e3419557b61c7294dc3b24a153dca4481f5acd Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Wed, 20 Nov 2024 14:39:49 +0100 Subject: [PATCH 061/295] remove: seencheck on fresh item --- internal/pkg/postprocessor/postprocessor.go | 5 +++++ internal/pkg/preprocessor/preprocessor.go | 4 ---- 2 
files changed, 5 insertions(+), 4 deletions(-) diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index 4a5e60cb..f21d71d2 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -103,8 +103,11 @@ func run() { func postprocess(item *models.Item) { defer item.SetStatus(models.ItemPostProcessed) + // TODO: execute assets redirection + // Verify if there is any redirection if isStatusCodeRedirect(item.URL.GetResponse().StatusCode) { + logger.Info("detected redirect", "url", item.URL.String()) // Check if the current redirections count doesn't exceed the max allowed if item.URL.GetRedirects() >= config.Get().MaxRedirect { logger.Warn("max redirects reached", "item", item.ID) @@ -119,5 +122,7 @@ func postprocess(item *models.Item) { }) return + } else { + logger.Info("no redirect", "url", item.URL.String()) } } diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index cf979748..f0237de9 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -125,10 +125,6 @@ func preprocess(item *models.Item) { logger.Warn("unable to validate URL", "url", item.URL.Raw, "err", err.Error(), "func", "preprocessor.preprocessor") return } - - if config.Get().UseSeencheck { - URLsToSeencheck = append(URLsToSeencheck, item.URL) - } } else if item.GetRedirection() != nil { URLType = "seed" From f25c2d95510d7bc2bf009354ce3734b3420392ee Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Wed, 20 Nov 2024 14:40:12 +0100 Subject: [PATCH 062/295] main: handle second quit signal --- main.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/main.go b/main.go index ea00f82d..be4df081 100644 --- a/main.go +++ b/main.go @@ -99,6 +99,12 @@ func main() { select { case <-signalChan: logger.Info("received shutdown signal, stopping services...") + // Catch a second signal to force exit + go func() { + 
<-signalChan + logger.Info("received second shutdown signal, forcing exit...") + os.Exit(1) + }() case item := <-seedErrorChan: logger.Error("received error from seedErrorChan", "err", item.GetError()) } From 227c38bfb47022a6ace92bd3d334fda9d81f2727 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Wed, 20 Nov 2024 14:42:28 +0100 Subject: [PATCH 063/295] fix: order of archive tasks --- internal/pkg/archiver/archiver.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index 3760f2b1..ece32c17 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -122,13 +122,13 @@ func archive(item *models.Item) { // Determines the URLs that need to be captured, if the item's status is fresh we need // to capture the seed, else if it's a redirection we need to captue it, and // else we need to capture the child URLs (assets), in parallel - if item.GetStatus() == models.ItemPreProcessed { - URLsToCapture = append(URLsToCapture, item.GetURL()) - } else if item.GetRedirection() != nil { + if item.GetRedirection() != nil { URLsToCapture = append(URLsToCapture, item.GetRedirection()) // We want to nil the redirection field when the capture of the redirection is done, we // will eventually fill it back in postprocess if this capture leads to another redirection defer item.SetRedirection(nil) + } else if item.GetStatus() == models.ItemPreProcessed { + URLsToCapture = append(URLsToCapture, item.GetURL()) } else { URLsToCapture = item.GetChilds() } From cc0d24ecba4ab54866385988e79a38c8100b323a Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Wed, 20 Nov 2024 14:57:38 +0100 Subject: [PATCH 064/295] hq: fixed the graceful stop --- internal/pkg/source/hq/consumer.go | 55 ++++++++++++++++++++++-------- internal/pkg/source/hq/finisher.go | 46 ++++++++++++++++++------- internal/pkg/source/hq/producer.go | 30 +++++++++++++--- 3 files changed, 98 insertions(+), 33 
deletions(-) diff --git a/internal/pkg/source/hq/consumer.go b/internal/pkg/source/hq/consumer.go index 74314ba2..6cb9d74a 100644 --- a/internal/pkg/source/hq/consumer.go +++ b/internal/pkg/source/hq/consumer.go @@ -6,12 +6,17 @@ import ( "time" "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/log" "github.com/internetarchive/Zeno/internal/pkg/reactor" "github.com/internetarchive/Zeno/pkg/models" "github.com/internetarchive/gocrawlhq" ) func consumer() { + logger := log.NewFieldedLogger(&log.Fields{ + "component": "hq.consumer", + }) + // Create a context to manage goroutines ctx, cancel := context.WithCancel(globalHQ.ctx) defer cancel() @@ -25,33 +30,48 @@ func consumer() { // WaitGroup to wait for goroutines to finish on shutdown var wg sync.WaitGroup - // Start the fetcher goroutine(s) + // Start the consumerFetcher goroutine(s) wg.Add(1) - go fetcher(ctx, &wg, urlBuffer, batchSize) + go consumerFetcher(ctx, &wg, urlBuffer, batchSize) - // Start the sender goroutine(s) + // Start the consumerSender goroutine(s) wg.Add(1) - go sender(ctx, &wg, urlBuffer) + go consumerSender(ctx, &wg, urlBuffer) // Wait for shutdown signal - <-globalHQ.ctx.Done() + for { + select { + case <-globalHQ.ctx.Done(): + logger.Debug("received done signal") + // Cancel the context to stop all goroutines + cancel() - // Cancel the context to stop all goroutines - cancel() + logger.Debug("waiting for goroutines to finish") + // Wait for all goroutines to finish + wg.Wait() - // Wait for all goroutines to finish - wg.Wait() + // Close the urlBuffer to signal consumerSenders to finish + close(urlBuffer) + + globalHQ.wg.Done() - // Close the urlBuffer to signal senders to finish - close(urlBuffer) + logger.Debug("closed") + return + } + } } -func fetcher(ctx context.Context, wg *sync.WaitGroup, urlBuffer chan<- *gocrawlhq.URL, batchSize int) { +func consumerFetcher(ctx context.Context, wg *sync.WaitGroup, urlBuffer chan<- *gocrawlhq.URL, 
batchSize int) { + logger := log.NewFieldedLogger(&log.Fields{ + "component": "hq.consumerFetcher", + }) + defer wg.Done() for { // Check for context cancellation select { case <-ctx.Done(): + logger.Debug("closing") return default: } @@ -59,7 +79,7 @@ func fetcher(ctx context.Context, wg *sync.WaitGroup, urlBuffer chan<- *gocrawlh // Fetch URLs from HQ URLs, err := getURLs(batchSize) if err != nil { - logger.Error("error fetching URLs from CrawlHQ", "err", err.Error(), "func", "hq.fetcher") + logger.Error("error fetching URLs from CrawlHQ", "err", err.Error(), "func", "hq.consumerFetcher") time.Sleep(250 * time.Millisecond) continue } @@ -76,15 +96,20 @@ func fetcher(ctx context.Context, wg *sync.WaitGroup, urlBuffer chan<- *gocrawlh } } -func sender(ctx context.Context, wg *sync.WaitGroup, urlBuffer <-chan *gocrawlhq.URL) { +func consumerSender(ctx context.Context, wg *sync.WaitGroup, urlBuffer <-chan *gocrawlhq.URL) { + logger := log.NewFieldedLogger(&log.Fields{ + "component": "hq.consumerSender", + }) + defer wg.Done() for { select { case <-ctx.Done(): + logger.Debug("closing") return case URL, ok := <-urlBuffer: if !ok { - // Channel closed, exit the sender + // Channel closed, exit the consumerSender return } diff --git a/internal/pkg/source/hq/finisher.go b/internal/pkg/source/hq/finisher.go index 06d8419b..626607a4 100644 --- a/internal/pkg/source/hq/finisher.go +++ b/internal/pkg/source/hq/finisher.go @@ -17,6 +17,10 @@ type finishBatch struct { // finisher initializes and starts the finisher and dispatcher processes. 
func finisher() { + logger := log.NewFieldedLogger(&log.Fields{ + "component": "hq.finisher", + }) + // Create a context to manage goroutines ctx, cancel := context.WithCancel(globalHQ.ctx) defer cancel() @@ -27,27 +31,40 @@ func finisher() { var wg sync.WaitGroup wg.Add(1) - go finishReceiver(ctx, &wg, batchCh) + go finisherReceiver(ctx, &wg, batchCh) wg.Add(1) - go finishDispatcher(ctx, &wg, batchCh) + go finisherDispatcher(ctx, &wg, batchCh) // Wait for the context to be canceled. - <-ctx.Done() + for { + select { + case <-globalHQ.ctx.Done(): + logger.Debug("received done signal") + // Cancel the context to stop all goroutines. + cancel() - // Cancel the context to stop all goroutines. - cancel() + logger.Debug("waiting for goroutines to finish") + // Wait for the finisher and dispatcher to finish. + wg.Wait() - // Wait for the finisher and dispatcher to finish. - wg.Wait() + // Close the batch channel to signal the dispatcher to finish. + close(batchCh) + + globalHQ.wg.Done() + + logger.Debug("closed") + return + } + } } -// finishReceiver reads URLs from finishCh, accumulates them into batches, and sends the batches to batchCh. -func finishReceiver(ctx context.Context, wg *sync.WaitGroup, batchCh chan *finishBatch) { +// finisherReceiver reads URLs from finishCh, accumulates them into batches, and sends the batches to batchCh. +func finisherReceiver(ctx context.Context, wg *sync.WaitGroup, batchCh chan *finishBatch) { defer wg.Done() logger := log.NewFieldedLogger(&log.Fields{ - "component": "hq.finishReceiver", + "component": "hq.finisherReceiver", }) batchSize := getBatchSize() @@ -62,6 +79,7 @@ func finishReceiver(ctx context.Context, wg *sync.WaitGroup, batchCh chan *finis for { select { case <-ctx.Done(): + logger.Debug("closing") // Send any remaining URLs. 
if len(batch.URLs) > 0 { logger.Debug("while closing sending remaining batch to dispatcher", "size", len(batch.URLs)) @@ -91,12 +109,12 @@ func finishReceiver(ctx context.Context, wg *sync.WaitGroup, batchCh chan *finis } } -// finishDispatcher receives batches from batchCh and dispatches them to sender routines. -func finishDispatcher(ctx context.Context, wg *sync.WaitGroup, batchCh chan *finishBatch) { +// finisherDispatcher receives batches from batchCh and dispatches them to sender routines. +func finisherDispatcher(ctx context.Context, wg *sync.WaitGroup, batchCh chan *finishBatch) { defer wg.Done() logger := log.NewFieldedLogger(&log.Fields{ - "component": "hq.finishDispatcher", + "component": "hq.finisherDispatcher", }) maxSenders := getMaxFinishSenders() @@ -115,6 +133,7 @@ func finishDispatcher(ctx context.Context, wg *sync.WaitGroup, batchCh chan *fin finishSender(ctx, batch) }(batch) case <-ctx.Done(): + logger.Debug("closing") // Wait for all sender routines to finish. senderWg.Wait() return @@ -135,6 +154,7 @@ func finishSender(ctx context.Context, batch *finishBatch) { err := globalHQ.client.Delete(batch.URLs, batch.ChildsCaptured) select { case <-ctx.Done(): + logger.Debug("closing") return default: if err != nil { diff --git a/internal/pkg/source/hq/producer.go b/internal/pkg/source/hq/producer.go index bb3773cc..d45f1c07 100644 --- a/internal/pkg/source/hq/producer.go +++ b/internal/pkg/source/hq/producer.go @@ -17,6 +17,10 @@ type producerBatch struct { // producer initializes and starts the producer and dispatcher processes. func producer() { + logger := log.NewFieldedLogger(&log.Fields{ + "component": "hq.producer", + }) + // Create a context to manage goroutines ctx, cancel := context.WithCancel(globalHQ.ctx) defer cancel() @@ -33,13 +37,26 @@ func producer() { go producerDispatcher(ctx, &wg, batchCh) // Wait for the context to be canceled. 
- <-ctx.Done() + for { + select { + case <-globalHQ.ctx.Done(): + logger.Debug("received done signal") + // Cancel the context to stop all goroutines. + cancel() - // Cancel the context to stop all goroutines. - cancel() + logger.Debug("waiting for goroutines to finish") + // Wait for the producer and dispatcher to finish. + wg.Wait() - // Wait for the producer and dispatcher to finish. - wg.Wait() + // Close the batch channel to signal the dispatcher to finish. + close(batchCh) + + globalHQ.wg.Done() + + logger.Debug("closed") + return + } + } } // producerReceiver reads URLs from produceCh, accumulates them into batches, and sends the batches to batchCh. @@ -62,6 +79,7 @@ func producerReceiver(ctx context.Context, wg *sync.WaitGroup, batchCh chan *pro for { select { case <-ctx.Done(): + logger.Debug("closing") // Send any remaining URLs. if len(batch.URLs) > 0 { logger.Debug("while closing, sending remaining batch to dispatcher", "size", len(batch.URLs)) @@ -117,6 +135,7 @@ func producerDispatcher(ctx context.Context, wg *sync.WaitGroup, batchCh chan *p producerSender(ctx, batch) }(batch) case <-ctx.Done(): + logger.Debug("closing") // Wait for all sender routines to finish. 
senderWg.Wait() return @@ -137,6 +156,7 @@ func producerSender(ctx context.Context, batch *producerBatch) { err := globalHQ.client.Add(batch.URLs, false) // Use bypassSeencheck = false select { case <-ctx.Done(): + logger.Debug("closing") return default: if err != nil { From d1df4aeec82c7b5ea53b735d37cfac614d54e25b Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Wed, 20 Nov 2024 15:00:28 +0100 Subject: [PATCH 065/295] fix: preprocessor --- internal/pkg/preprocessor/preprocessor.go | 102 ++++++++-------------- pkg/models/item.go | 4 + pkg/models/url.go | 12 +++ 3 files changed, 53 insertions(+), 65 deletions(-) diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index f0237de9..cb743feb 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -110,105 +110,77 @@ func preprocess(item *models.Item) { // TODO: if an error happen and it's a fresh item, we should mark it as failed in HQ (if it's a HQ-based crawl) var ( - err error - URLsToSeencheck []*models.URL - URLType string + URLsToPreprocess []*models.URL + URLType models.URLType + err error ) - // Validate the URLs, either the item's URL or its childs if it has any if item.GetStatus() == models.ItemFresh { - URLType = "seed" - - // Validate the item's URL itself - err = normalizeURL(item.URL, nil) - if err != nil { - logger.Warn("unable to validate URL", "url", item.URL.Raw, "err", err.Error(), "func", "preprocessor.preprocessor") - return - } + URLType = models.URLTypeSeed + URLsToPreprocess = append(URLsToPreprocess, item.GetURL()) } else if item.GetRedirection() != nil { - URLType = "seed" + URLType = models.URLTypeRedirection + URLsToPreprocess = append(URLsToPreprocess, item.GetRedirection()) + } else if len(item.Childs) > 0 { + URLType = models.URLTypeAsset + URLsToPreprocess = append(URLsToPreprocess, item.GetChilds()...) 
+ } else { + panic("item has no URL to preprocess") + } - // Validate the item's URL itself - err = normalizeURL(item.GetURL(), nil) - if err != nil { - logger.Warn("unable to validate URL", "url", item.URL.Raw, "err", err.Error(), "func", "preprocessor.preprocessor") - return - } + // Validate the URLs + for i := 0; i < len(URLsToPreprocess); { + var parentURL *models.URL - if config.Get().UseSeencheck { - URLsToSeencheck = append(URLsToSeencheck, item.URL) + if URLType != models.URLTypeSeed { + parentURL = item.GetURL() } - } else if len(item.Childs) > 0 { - URLType = "asset" - // Validate the URLs of the child items - for i := 0; i < len(item.Childs); { - err = normalizeURL(item.Childs[i], item.URL) - if err != nil { - // If we can't validate an URL, we remove it from the list of childs - logger.Warn("unable to validate URL", "url", item.Childs[i].Raw, "err", err.Error(), "func", "preprocessor.preprocessor") - item.Childs = append(item.Childs[:i], item.Childs[i+1:]...) - } else { - if config.Get().UseSeencheck { - URLsToSeencheck = append(URLsToSeencheck, item.Childs[i]) - } - - i++ - } + err = normalizeURL(URLsToPreprocess[i], parentURL) + if err != nil { + // If we can't validate an URL, we remove it from the list of childs + logger.Warn("unable to validate URL", "url", URLsToPreprocess[i].Raw, "err", err.Error(), "func", "preprocessor.preprocess") + URLsToPreprocess = append(URLsToPreprocess[:i], URLsToPreprocess[i+1:]...) 
+ } else { + i++ } - } else { - logger.Error("item got into preprocessoring without anything to preprocessor") } - // If we have URLs to seencheck, we do it - if len(URLsToSeencheck) > 0 { + // If the item is a redirection or an asset, we need to seencheck it if needed + if config.Get().UseSeencheck && URLType != models.URLTypeSeed { var seencheckedURLs []*models.URL if config.Get().HQ { - seencheckedURLs, err = hq.SeencheckURLs(URLType, item.URL) + seencheckedURLs, err = hq.SeencheckURLs(string(URLType), item.URL) if err != nil { logger.Warn("unable to seencheck URL", "url", item.URL.Raw, "err", err.Error(), "func", "preprocessor.preprocess") return } } else { - seencheckedURLs, err = seencheck.SeencheckURLs(URLType, item.URL) + seencheckedURLs, err = seencheck.SeencheckURLs(string(URLType), item.URL) if err != nil { logger.Warn("unable to seencheck URL", "url", item.URL.Raw, "err", err.Error(), "func", "preprocessor.preprocess") return } } - if len(seencheckedURLs) == 0 { - return - } - - if URLType == "seed" { - item.URL = seencheckedURLs[0] - } else { - item.Childs = seencheckedURLs + switch URLType { + case models.URLTypeRedirection: + item.SetRedirection(nil) + case models.URLTypeAsset: + item.SetChilds(seencheckedURLs) } } // Finally, we build the requests, applying any site-specific behavior needed - if URLType == "seed" { + for _, URL := range URLsToPreprocess { // TODO: apply site-specific stuff - req, err := http.NewRequest(http.MethodGet, item.URL.String(), nil) + req, err := http.NewRequest(http.MethodGet, URL.String(), nil) if err != nil { logger.Error("unable to create new request for URL", "url", item.URL.String(), "err", err.Error(), "func", "preprocessor.preprocess") return } - item.URL.SetRequest(req) - } else { - for i, child := range item.Childs { - // TODO: apply site-specific stuff - req, err := http.NewRequest(http.MethodGet, child.String(), nil) - if err != nil { - logger.Error("unable to create new request for URL", "url", 
item.URL.String(), "err", err.Error(), "func", "preprocessor.preprocess") - return - } - - item.Childs[i].SetRequest(req) - } + URL.SetRequest(req) } } diff --git a/pkg/models/item.go b/pkg/models/item.go index 66cd493d..cf2cf2bb 100644 --- a/pkg/models/item.go +++ b/pkg/models/item.go @@ -77,6 +77,10 @@ func (i *Item) SetSource(source ItemSource) { i.Source = source } +func (i *Item) SetChilds(childs []*URL) { + i.Childs = childs +} + func (i *Item) SetChildsCaptured(captured bool) { i.ChildsCaptured = captured } diff --git a/pkg/models/url.go b/pkg/models/url.go index f21e7e4e..afab4f94 100644 --- a/pkg/models/url.go +++ b/pkg/models/url.go @@ -123,3 +123,15 @@ func encodeQuery(v url.Values) string { } return buf.String() } + +// URLType qualifies the type of URL +type URLType string + +const ( + // URLTypeSeed is for URLs that came from the queue or HQ + URLTypeSeed URLType = "seed" + // URLTypeRedirection is for URLs that are redirections + URLTypeRedirection = "seed" + // URLTypeAsset is for URLs that are assets of a page + URLTypeAsset = "asset" +) From 2e0cd15544e9426a19a2fa6cd5f26d8459597e21 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Wed, 20 Nov 2024 15:06:11 +0100 Subject: [PATCH 066/295] fix: postprocessor --- internal/pkg/postprocessor/postprocessor.go | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index f21d71d2..eece1ec6 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -104,25 +104,32 @@ func postprocess(item *models.Item) { defer item.SetStatus(models.ItemPostProcessed) // TODO: execute assets redirection + var URL *models.URL + + if item.GetRedirection() != nil { + URL = item.GetRedirection() + } else { + URL = item.GetURL() + } // Verify if there is any redirection - if isStatusCodeRedirect(item.URL.GetResponse().StatusCode) { - 
logger.Info("detected redirect", "url", item.URL.String()) + if isStatusCodeRedirect(URL.GetResponse().StatusCode) { + logger.Info("detected redirect", "url", URL.String()) // Check if the current redirections count doesn't exceed the max allowed - if item.URL.GetRedirects() >= config.Get().MaxRedirect { + if URL.GetRedirects() >= config.Get().MaxRedirect { logger.Warn("max redirects reached", "item", item.ID) return } // Prepare the new item resulting from the redirection item.SetRedirection(&models.URL{ - Raw: item.URL.GetResponse().Header.Get("Location"), - Redirects: item.URL.GetRedirects() + 1, - Hops: item.URL.GetHops(), + Raw: URL.GetResponse().Header.Get("Location"), + Redirects: URL.GetRedirects() + 1, + Hops: URL.GetHops(), }) return } else { - logger.Info("no redirect", "url", item.URL.String()) + logger.Info("no redirect", "url", URL.String()) } } From 6aa4a80feec90989d4329ca04cc719d6149814a7 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Wed, 20 Nov 2024 15:18:07 +0100 Subject: [PATCH 067/295] fix: redirection --- internal/pkg/archiver/archiver.go | 3 --- internal/pkg/postprocessor/postprocessor.go | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index ece32c17..384e4b0b 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -124,9 +124,6 @@ func archive(item *models.Item) { // else we need to capture the child URLs (assets), in parallel if item.GetRedirection() != nil { URLsToCapture = append(URLsToCapture, item.GetRedirection()) - // We want to nil the redirection field when the capture of the redirection is done, we - // will eventually fill it back in postprocess if this capture leads to another redirection - defer item.SetRedirection(nil) } else if item.GetStatus() == models.ItemPreProcessed { URLsToCapture = append(URLsToCapture, item.GetURL()) } else { diff --git a/internal/pkg/postprocessor/postprocessor.go 
b/internal/pkg/postprocessor/postprocessor.go index eece1ec6..cf6605fd 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -114,7 +114,6 @@ func postprocess(item *models.Item) { // Verify if there is any redirection if isStatusCodeRedirect(URL.GetResponse().StatusCode) { - logger.Info("detected redirect", "url", URL.String()) // Check if the current redirections count doesn't exceed the max allowed if URL.GetRedirects() >= config.Get().MaxRedirect { logger.Warn("max redirects reached", "item", item.ID) @@ -130,6 +129,6 @@ func postprocess(item *models.Item) { return } else { - logger.Info("no redirect", "url", URL.String()) + item.SetRedirection(nil) } } From 6514f2c805062a8ad25ab541df9d3627194e8f75 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Wed, 20 Nov 2024 16:01:51 +0100 Subject: [PATCH 068/295] hq: add debug logging and dereference the producer/finisher batch before sending it and reseting the pointer --- internal/pkg/log/config.go | 2 +- internal/pkg/source/hq/consumer.go | 6 +++++- internal/pkg/source/hq/finisher.go | 28 +++++++++++++++++++--------- internal/pkg/source/hq/producer.go | 14 ++++++++++---- 4 files changed, 35 insertions(+), 15 deletions(-) diff --git a/internal/pkg/log/config.go b/internal/pkg/log/config.go index 10f29f5c..07172c9e 100644 --- a/internal/pkg/log/config.go +++ b/internal/pkg/log/config.go @@ -39,7 +39,7 @@ type ElasticsearchConfig struct { func defaultConfig() *Config { return &Config{ StdoutEnabled: true, - StdoutLevel: slog.LevelInfo, + StdoutLevel: slog.LevelDebug, StderrEnabled: true, StderrLevel: slog.LevelError, } diff --git a/internal/pkg/source/hq/consumer.go b/internal/pkg/source/hq/consumer.go index 6cb9d74a..5a07f609 100644 --- a/internal/pkg/source/hq/consumer.go +++ b/internal/pkg/source/hq/consumer.go @@ -79,7 +79,11 @@ func consumerFetcher(ctx context.Context, wg *sync.WaitGroup, urlBuffer chan<- * // Fetch URLs from HQ URLs, err := getURLs(batchSize) 
if err != nil { - logger.Error("error fetching URLs from CrawlHQ", "err", err.Error(), "func", "hq.consumerFetcher") + if err.Error() == "gocrawlhq: feed is empty" { + logger.Debug("feed is empty, waiting for new URLs") + } else { + logger.Error("error fetching URLs from CrawlHQ", "err", err.Error(), "func", "hq.consumerFetcher") + } time.Sleep(250 * time.Millisecond) continue } diff --git a/internal/pkg/source/hq/finisher.go b/internal/pkg/source/hq/finisher.go index 626607a4..c4414af8 100644 --- a/internal/pkg/source/hq/finisher.go +++ b/internal/pkg/source/hq/finisher.go @@ -87,22 +87,30 @@ func finisherReceiver(ctx context.Context, wg *sync.WaitGroup, batchCh chan *fin } return case item := <-globalHQ.finishCh: + logger.Debug("received item", "item", item.ID) URL := gocrawlhq.URL{ - ID: item.ID, + ID: item.ID, + Type: "seed", } batch.URLs = append(batch.URLs, URL) if len(batch.URLs) >= batchSize { logger.Debug("sending batch to dispatcher", "size", len(batch.URLs)) // Send the batch to batchCh. - batchCh <- batch // Blocks if batchCh is full. - batch.URLs = make([]gocrawlhq.URL, 0, batchSize) + copyBatch := *batch + batchCh <- ©Batch // Blocks if batchCh is full. + batch = &finishBatch{ + URLs: make([]gocrawlhq.URL, 0, batchSize), + } resetTimer(timer, maxWaitTime) } case <-timer.C: if len(batch.URLs) > 0 { logger.Debug("sending non-full batch to dispatcher", "size", len(batch.URLs)) - batchCh <- batch // Blocks if batchCh is full. - batch.URLs = make([]gocrawlhq.URL, 0, batchSize) + copyBatch := *batch + batchCh <- ©Batch // Blocks if batchCh is full. 
+ batch = &finishBatch{ + URLs: make([]gocrawlhq.URL, 0, batchSize), + } } resetTimer(timer, maxWaitTime) } @@ -130,7 +138,7 @@ func finisherDispatcher(ctx context.Context, wg *sync.WaitGroup, batchCh chan *f go func(batch *finishBatch) { defer senderWg.Done() defer func() { <-senderSemaphore }() - finishSender(ctx, batch) + finisherSender(ctx, batch) }(batch) case <-ctx.Done(): logger.Debug("closing") @@ -141,15 +149,17 @@ func finisherDispatcher(ctx context.Context, wg *sync.WaitGroup, batchCh chan *f } } -// finishSender sends a batch of URLs to HQ with retries and exponential backoff. -func finishSender(ctx context.Context, batch *finishBatch) { +// finisherSender sends a batch of URLs to HQ with retries and exponential backoff. +func finisherSender(ctx context.Context, batch *finishBatch) { logger := log.NewFieldedLogger(&log.Fields{ - "component": "hq.finishSender", + "component": "hq.finisherSender", }) backoff := time.Second maxBackoff := 5 * time.Second + logger.Debug("sending batch to HQ", "size", len(batch.URLs)) + for { err := globalHQ.client.Delete(batch.URLs, batch.ChildsCaptured) select { diff --git a/internal/pkg/source/hq/producer.go b/internal/pkg/source/hq/producer.go index d45f1c07..ba70d73d 100644 --- a/internal/pkg/source/hq/producer.go +++ b/internal/pkg/source/hq/producer.go @@ -96,15 +96,21 @@ func producerReceiver(ctx context.Context, wg *sync.WaitGroup, batchCh chan *pro if len(batch.URLs) >= batchSize { logger.Debug("sending batch to dispatcher", "size", len(batch.URLs)) // Send the batch to batchCh. - batchCh <- batch // Blocks if batchCh is full. - batch.URLs = make([]gocrawlhq.URL, 0, batchSize) + copyBatch := *batch + batchCh <- ©Batch // Blocks if batchCh is full. 
+ batch = &producerBatch{ + URLs: make([]gocrawlhq.URL, 0, batchSize), + } resetTimer(timer, maxWaitTime) } case <-timer.C: if len(batch.URLs) > 0 { logger.Debug("sending non-full batch to dispatcher", "size", len(batch.URLs)) - batchCh <- batch // Blocks if batchCh is full. - batch.URLs = make([]gocrawlhq.URL, 0, batchSize) + copyBatch := *batch + batchCh <- ©Batch // Blocks if batchCh is full. + batch = &producerBatch{ + URLs: make([]gocrawlhq.URL, 0, batchSize), + } } resetTimer(timer, maxWaitTime) } From b07138027758b793bfec81d9b6427239c32aa3ed Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Wed, 20 Nov 2024 16:17:36 +0100 Subject: [PATCH 069/295] add: HTTP client(s) timeout setting --- internal/pkg/archiver/warc.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/internal/pkg/archiver/warc.go b/internal/pkg/archiver/warc.go index 25ddb3e0..7f93edd4 100644 --- a/internal/pkg/archiver/warc.go +++ b/internal/pkg/archiver/warc.go @@ -2,6 +2,7 @@ package archiver import ( "os" + "time" "github.com/CorentinB/warc" "github.com/internetarchive/Zeno/internal/pkg/config" @@ -76,6 +77,17 @@ func startWARCWriter() { } }() } + + // Set the timeouts + if config.Get().HTTPTimeout > 0 { + if globalArchiver.Client != nil { + globalArchiver.Client.Timeout = time.Duration(config.Get().HTTPTimeout) * time.Second + } + + if globalArchiver.ClientWithProxy != nil { + globalArchiver.ClientWithProxy.Timeout = time.Duration(config.Get().HTTPTimeout) * time.Second + } + } } func GetClients() (clients []*warc.CustomHTTPClient) { From 7bed9fe2afdbed9f8721c71021f757cdc293c2cc Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Wed, 20 Nov 2024 16:20:25 +0100 Subject: [PATCH 070/295] hq: clean package --- internal/pkg/source/hq/hq.go | 59 ++---------------------------------- main.go | 1 + 2 files changed, 4 insertions(+), 56 deletions(-) diff --git a/internal/pkg/source/hq/hq.go b/internal/pkg/source/hq/hq.go index 23155e90..843ee5dc 100644 --- 
a/internal/pkg/source/hq/hq.go +++ b/internal/pkg/source/hq/hq.go @@ -2,7 +2,6 @@ package hq import ( "context" - "os" "sync" "github.com/internetarchive/Zeno/internal/pkg/config" @@ -29,6 +28,7 @@ var ( func Start(finishChan, produceChan chan *models.Item) error { var done bool + var startErr error log.Start() logger = log.NewFieldedLogger(&log.Fields{ @@ -38,13 +38,11 @@ func Start(finishChan, produceChan chan *models.Item) error { stats.Init() once.Do(func() { - var err error - ctx, cancel := context.WithCancel(context.Background()) HQclient, err := gocrawlhq.Init(config.Get().HQKey, config.Get().HQSecret, config.Get().HQProject, config.Get().HQAddress, "") if err != nil { logger.Error("error initializing crawl HQ client", "err", err.Error(), "func", "hq.Start") - os.Exit(1) + startErr = err } globalHQ = &hq{ @@ -67,7 +65,7 @@ func Start(finishChan, produceChan chan *models.Item) error { return ErrHQAlreadyInitialized } - return nil + return startErr } func Stop() { @@ -78,54 +76,3 @@ func Stop() { logger.Info("stopped") } } - -// func HQFinisher() { -// defer c.HQChannelsWg.Done() - -// var ( -// finishedArray = []gocrawlhq.URL{} -// locallyCrawledTotal int -// ) - -// for finishedItem := range c.HQFinishedChannel { -// if finishedItem.ID == "" { -// c.Log.WithFields(c.genLogFields(nil, finishedItem.URL, nil)).Warn("URL has no ID, discarding") -// continue -// } - -// locallyCrawledTotal += int(finishedItem.LocallyCrawled) -// finishedArray = append(finishedArray, gocrawlhq.URL{ID: finishedItem.ID, Value: utils.URLToString(finishedItem.URL)}) - -// if len(finishedArray) == int(math.Ceil(float64(c.Workers.Count)/2)) { -// for { -// err := c.HQClient.Delete(finishedArray, locallyCrawledTotal) -// if err != nil { -// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ -// "finishedArray": finishedArray, -// })).Error("error submitting finished urls to crawl HQ. 
retrying in one second...") -// time.Sleep(time.Second) -// continue -// } -// break -// } - -// finishedArray = []gocrawlhq.URL{} -// locallyCrawledTotal = 0 -// } -// } - -// // send remaining finished URLs -// if len(finishedArray) > 0 { -// for { -// err := c.HQClient.Delete(finishedArray, locallyCrawledTotal) -// if err != nil { -// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ -// "finishedArray": finishedArray, -// })).Error("error submitting finished urls to crawl HQ. retrying in one second...") -// time.Sleep(time.Second) -// continue -// } -// break -// } -// } -// } diff --git a/main.go b/main.go index be4df081..2092b7e6 100644 --- a/main.go +++ b/main.go @@ -116,4 +116,5 @@ func main() { preprocessor.Stop() reactor.Stop() logger.Info("all services stopped, exiting") + return } From 678cfd3a12b8717cb130c7bffacafb892bee2bb9 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Wed, 20 Nov 2024 16:44:05 +0100 Subject: [PATCH 071/295] global: normalized logging --- internal/pkg/archiver/archiver.go | 9 +++------ internal/pkg/finisher/finisher.go | 19 ++++++++++--------- internal/pkg/log/config.go | 2 +- internal/pkg/postprocessor/postprocessor.go | 7 ++++--- internal/pkg/preprocessor/preprocessor.go | 5 +++-- internal/pkg/preprocessor/url.go | 2 -- internal/pkg/reactor/reactor.go | 4 ++-- internal/pkg/source/hq/finisher.go | 2 +- internal/pkg/source/hq/hq.go | 3 +++ internal/pkg/source/hq/producer.go | 2 ++ pkg/models/item.go | 4 ++++ 11 files changed, 33 insertions(+), 26 deletions(-) diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index 384e4b0b..7b3118a2 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -51,6 +51,7 @@ func Start(inputChan, outputChan, errorChan chan *models.Item) error { outputCh: outputChan, errorCh: errorChan, } + logger.Debug("initialized") // Setup WARC writing HTTP clients startWARCWriter() @@ -89,11 +90,11 @@ func run() { select { // Closes 
the run routine when context is canceled case <-globalArchiver.ctx.Done(): - logger.Info("shutting down") + logger.Debug("shutting down") return case item, ok := <-globalArchiver.inputCh: if ok { - logger.Info("received item", "item", item.ID) + logger.Debug("received item", "item", item.GetShortID()) guard <- struct{}{} wg.Add(1) stats.ArchiverRoutinesIncr() @@ -161,10 +162,6 @@ func archive(item *models.Item) { // Set the response in the item URL.SetResponse(resp) - if resp.StatusCode != 200 { - logger.Warn("non-200 status code", "status_code", resp.StatusCode) - } - // For now, we only consume it _, err = io.Copy(io.Discard, resp.Body) if err != nil { diff --git a/internal/pkg/finisher/finisher.go b/internal/pkg/finisher/finisher.go index f16e7637..858837a0 100644 --- a/internal/pkg/finisher/finisher.go +++ b/internal/pkg/finisher/finisher.go @@ -78,16 +78,16 @@ func (f *finisher) run() { for { select { case <-f.ctx.Done(): - logger.Info("shutting down") + logger.Debug("shutting down") return case item := <-f.inputCh: if item == nil { panic("received nil item") } - logger.Debug("received item", "item", item.ID) + logger.Debug("received item", "item", item.GetShortID()) if item.Error != nil { - logger.Error("received item with error", "item", item.ID, "err", item.Error) + logger.Error("received item with error", "item", item.GetShortID(), "err", item.Error) f.errorCh <- item continue } @@ -96,37 +96,38 @@ func (f *finisher) run() { logger.Debug("fresh item received", "item", item) f.sourceProducedCh <- item } else if item.GetRedirection() != nil { - logger.Debug("item has redirection", "item", item.ID) + logger.Debug("item has redirection", "item", item.GetShortID()) err := reactor.ReceiveFeedback(item) if err != nil { panic(err) } } else if len(item.GetChilds()) != 0 { - logger.Debug("item has children", "item", item.ID) + logger.Debug("item has children", "item", item.GetShortID()) err := reactor.ReceiveFeedback(item) if err != nil { panic(err) } } else { - 
logger.Debug("item has no redirection or children", "item", item.ID) + logger.Debug("item has no redirection or children", "item", item.GetShortID()) err := reactor.MarkAsFinished(item) if err != nil { panic(err) } f.sourceFinishedCh <- item + logger.Info("crawled", "url", item.GetURL(), "item", item.GetShortID()) } - logger.Debug("item finished", "item", item.ID) + logger.Debug("item finished", "item", item.GetShortID()) case item := <-f.errorCh: if item == nil { panic("received nil item") } - logger.Info("received item with error", "item", item.ID, "err", item.Error) + logger.Debug("received item with error", "item", item.GetShortID(), "err", item.Error) reactor.MarkAsFinished(item) - logger.Debug("item with error finished", "item", item.ID) + logger.Debug("item with error finished", "item", item.GetShortID()) } } } diff --git a/internal/pkg/log/config.go b/internal/pkg/log/config.go index 07172c9e..10f29f5c 100644 --- a/internal/pkg/log/config.go +++ b/internal/pkg/log/config.go @@ -39,7 +39,7 @@ type ElasticsearchConfig struct { func defaultConfig() *Config { return &Config{ StdoutEnabled: true, - StdoutLevel: slog.LevelDebug, + StdoutLevel: slog.LevelInfo, StderrEnabled: true, StderrLevel: slog.LevelError, } diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index cf6605fd..35b2f8d4 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -46,6 +46,7 @@ func Start(inputChan, outputChan, errorChan chan *models.Item) error { outputCh: outputChan, errorCh: errorChan, } + logger.Debug("initialized") globalPostprocessor.wg.Add(1) go run() logger.Info("started") @@ -80,11 +81,11 @@ func run() { select { // Closes the run routine when context is canceled case <-globalPostprocessor.ctx.Done(): - logger.Info("shutting down") + logger.Debug("shutting down") return case item, ok := <-globalPostprocessor.inputCh: if ok { - logger.Info("received item", "item", item.ID) 
+ logger.Debug("received item", "item", item.GetShortID()) guard <- struct{}{} wg.Add(1) stats.PostprocessorRoutinesIncr() @@ -116,7 +117,7 @@ func postprocess(item *models.Item) { if isStatusCodeRedirect(URL.GetResponse().StatusCode) { // Check if the current redirections count doesn't exceed the max allowed if URL.GetRedirects() >= config.Get().MaxRedirect { - logger.Warn("max redirects reached", "item", item.ID) + logger.Warn("max redirects reached", "item", item.GetShortID()) return } diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index cb743feb..eba5463b 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -49,6 +49,7 @@ func Start(inputChan, outputChan, errorChan chan *models.Item) error { outputCh: outputChan, errorCh: errorChan, } + logger.Debug("initialized") globalPreprocessor.wg.Add(1) go run() logger.Info("started") @@ -83,11 +84,11 @@ func run() { select { // Closes the run routine when context is canceled case <-globalPreprocessor.ctx.Done(): - logger.Info("shutting down") + logger.Debug("shutting down") return case item, ok := <-globalPreprocessor.inputCh: if ok { - logger.Info("received item", "item", item.ID) + logger.Debug("received item", "item", item.GetShortID()) guard <- struct{}{} wg.Add(1) stats.PreprocessorRoutinesIncr() diff --git a/internal/pkg/preprocessor/url.go b/internal/pkg/preprocessor/url.go index 0f22face..7beeefb7 100644 --- a/internal/pkg/preprocessor/url.go +++ b/internal/pkg/preprocessor/url.go @@ -1,7 +1,6 @@ package preprocessor import ( - "fmt" "net/url" "github.com/ada-url/goada" @@ -19,7 +18,6 @@ func normalizeURL(URL *models.URL, parentURL *models.URL) (err error) { if err != nil { return err } - fmt.Println(parsedURL.Scheme) if parsedURL.Scheme == "" { parsedURL.Scheme = "http" } diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index 767ccb57..18b7ac82 100644 --- 
a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -98,7 +98,7 @@ func ReceiveFeedback(item *models.Item) error { // ReceiveInsert sends an item to the input channel consuming a token. // It is the responsibility of the sender to set either ItemSourceQueue or ItemSourceHQ, if not set seed will get forced ItemSourceInsert func ReceiveInsert(item *models.Item) error { - logger.Info("received seed", "seed", item.ID) + logger.Debug("received item", "item", item.GetShortID()) if globalReactor == nil { return ErrReactorNotInitialized } @@ -136,7 +136,7 @@ func (r *reactor) run() { select { // Closes the run routine when context is canceled case <-r.ctx.Done(): - logger.Info("shutting down") + logger.Debug("shutting down") return // Feeds items to the output channel diff --git a/internal/pkg/source/hq/finisher.go b/internal/pkg/source/hq/finisher.go index c4414af8..ebc457a8 100644 --- a/internal/pkg/source/hq/finisher.go +++ b/internal/pkg/source/hq/finisher.go @@ -87,7 +87,7 @@ func finisherReceiver(ctx context.Context, wg *sync.WaitGroup, batchCh chan *fin } return case item := <-globalHQ.finishCh: - logger.Debug("received item", "item", item.ID) + logger.Debug("received item", "item", item.GetShortID()) URL := gocrawlhq.URL{ ID: item.ID, Type: "seed", diff --git a/internal/pkg/source/hq/hq.go b/internal/pkg/source/hq/hq.go index 843ee5dc..6f3568a7 100644 --- a/internal/pkg/source/hq/hq.go +++ b/internal/pkg/source/hq/hq.go @@ -58,6 +58,9 @@ func Start(finishChan, produceChan chan *models.Item) error { go consumer() go producer() go finisher() + + logger.Info("started") + done = true }) diff --git a/internal/pkg/source/hq/producer.go b/internal/pkg/source/hq/producer.go index ba70d73d..f4d71e41 100644 --- a/internal/pkg/source/hq/producer.go +++ b/internal/pkg/source/hq/producer.go @@ -158,6 +158,8 @@ func producerSender(ctx context.Context, batch *producerBatch) { backoff := time.Second maxBackoff := 5 * time.Second + logger.Debug("sending batch 
to HQ", "size", len(batch.URLs)) + for { err := globalHQ.client.Add(batch.URLs, false) // Use bypassSeencheck = false select { diff --git a/pkg/models/item.go b/pkg/models/item.go index cf2cf2bb..e2af04b0 100644 --- a/pkg/models/item.go +++ b/pkg/models/item.go @@ -41,6 +41,10 @@ func (i *Item) GetID() string { return i.ID } +func (i *Item) GetShortID() string { + return i.ID[:5] +} + func (i *Item) GetURL() *URL { return i.URL } From 96df4cebf1fd0ad3e5fa3fe5fb9432afbd3f88b1 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Wed, 20 Nov 2024 16:54:08 +0100 Subject: [PATCH 072/295] fix: handle failed items --- internal/pkg/postprocessor/postprocessor.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index 35b2f8d4..9fcf1a9e 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -102,6 +102,11 @@ func run() { } func postprocess(item *models.Item) { + if item.GetStatus() != models.ItemFailed { + item.SetRedirection(nil) + return + } + defer item.SetStatus(models.ItemPostProcessed) // TODO: execute assets redirection From d3920de782c038b7f7c9c02d1523e8cf35724e09 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Wed, 20 Nov 2024 17:12:24 +0100 Subject: [PATCH 073/295] hq: automatically reset seeds ; global: remove seed error chan --- internal/pkg/archiver/archiver.go | 4 +--- internal/pkg/finisher/finisher.go | 19 +------------------ internal/pkg/postprocessor/postprocessor.go | 4 +--- internal/pkg/preprocessor/preprocessor.go | 4 +--- internal/pkg/source/hq/hq.go | 3 +++ main.go | 14 +++++--------- 6 files changed, 12 insertions(+), 36 deletions(-) diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index 7b3118a2..5366da84 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -19,7 +19,6 @@ type archiver struct { cancel context.CancelFunc inputCh 
chan *models.Item outputCh chan *models.Item - errorCh chan *models.Item Client *warc.CustomHTTPClient ClientWithProxy *warc.CustomHTTPClient @@ -32,7 +31,7 @@ var ( ) // This functions starts the archiver responsible for capturing the URLs -func Start(inputChan, outputChan, errorChan chan *models.Item) error { +func Start(inputChan, outputChan chan *models.Item) error { var done bool log.Start() @@ -49,7 +48,6 @@ func Start(inputChan, outputChan, errorChan chan *models.Item) error { cancel: cancel, inputCh: inputChan, outputCh: outputChan, - errorCh: errorChan, } logger.Debug("initialized") diff --git a/internal/pkg/finisher/finisher.go b/internal/pkg/finisher/finisher.go index 858837a0..94241c9b 100644 --- a/internal/pkg/finisher/finisher.go +++ b/internal/pkg/finisher/finisher.go @@ -13,7 +13,6 @@ type finisher struct { ctx context.Context cancel context.CancelFunc inputCh chan *models.Item - errorCh chan *models.Item sourceFinishedCh chan *models.Item sourceProducedCh chan *models.Item wg sync.WaitGroup @@ -27,7 +26,7 @@ var ( // Start initializes the global finisher with the given input channel. // This method can only be called once. 
-func Start(inputChan, errorChan, sourceFinishedChan, sourceProducedChan chan *models.Item) error { +func Start(inputChan, sourceFinishedChan, sourceProducedChan chan *models.Item) error { var done bool log.Start() @@ -41,7 +40,6 @@ func Start(inputChan, errorChan, sourceFinishedChan, sourceProducedChan chan *mo ctx: ctx, cancel: cancel, inputCh: inputChan, - errorCh: errorChan, sourceFinishedCh: sourceFinishedChan, sourceProducedCh: sourceProducedChan, wg: sync.WaitGroup{}, @@ -86,11 +84,6 @@ func (f *finisher) run() { } logger.Debug("received item", "item", item.GetShortID()) - if item.Error != nil { - logger.Error("received item with error", "item", item.GetShortID(), "err", item.Error) - f.errorCh <- item - continue - } if item.GetStatus() == models.ItemFresh { logger.Debug("fresh item received", "item", item) @@ -118,16 +111,6 @@ func (f *finisher) run() { } logger.Debug("item finished", "item", item.GetShortID()) - case item := <-f.errorCh: - if item == nil { - panic("received nil item") - } - - logger.Debug("received item with error", "item", item.GetShortID(), "err", item.Error) - - reactor.MarkAsFinished(item) - - logger.Debug("item with error finished", "item", item.GetShortID()) } } } diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index 9fcf1a9e..a56d8110 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -16,7 +16,6 @@ type postprocessor struct { cancel context.CancelFunc inputCh chan *models.Item outputCh chan *models.Item - errorCh chan *models.Item } var ( @@ -27,7 +26,7 @@ var ( // This functions starts the preprocessor responsible for preparing // the seeds sent by the reactor for captures -func Start(inputChan, outputChan, errorChan chan *models.Item) error { +func Start(inputChan, outputChan chan *models.Item) error { var done bool log.Start() @@ -44,7 +43,6 @@ func Start(inputChan, outputChan, errorChan chan *models.Item) error { 
cancel: cancel, inputCh: inputChan, outputCh: outputChan, - errorCh: errorChan, } logger.Debug("initialized") globalPostprocessor.wg.Add(1) diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index eba5463b..e1687e26 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -19,7 +19,6 @@ type preprocessor struct { cancel context.CancelFunc inputCh chan *models.Item outputCh chan *models.Item - errorCh chan *models.Item } var ( @@ -30,7 +29,7 @@ var ( // This functions starts the preprocessor responsible for preparing // the seeds sent by the reactor for captures -func Start(inputChan, outputChan, errorChan chan *models.Item) error { +func Start(inputChan, outputChan chan *models.Item) error { var done bool log.Start() @@ -47,7 +46,6 @@ func Start(inputChan, outputChan, errorChan chan *models.Item) error { cancel: cancel, inputCh: inputChan, outputCh: outputChan, - errorCh: errorChan, } logger.Debug("initialized") globalPreprocessor.wg.Add(1) diff --git a/internal/pkg/source/hq/hq.go b/internal/pkg/source/hq/hq.go index 6f3568a7..f5c94674 100644 --- a/internal/pkg/source/hq/hq.go +++ b/internal/pkg/source/hq/hq.go @@ -75,6 +75,9 @@ func Stop() { if globalHQ != nil { globalHQ.cancel() globalHQ.wg.Wait() + if err := globalHQ.client.Reset(); err != nil { + logger.Error("error while reseting", "err", err) + } once = sync.Once{} logger.Info("stopped") } diff --git a/main.go b/main.go index 2092b7e6..771fd298 100644 --- a/main.go +++ b/main.go @@ -51,28 +51,26 @@ func main() { } } - seedErrorChan := make(chan *models.Item) - // Start the reactor that will receive reactorOutputChan := make(chan *models.Item) err := reactor.Start(config.Get().WorkersCount, reactorOutputChan) preprocessorOutputChan := make(chan *models.Item) - err = preprocessor.Start(reactorOutputChan, preprocessorOutputChan, seedErrorChan) + err = preprocessor.Start(reactorOutputChan, preprocessorOutputChan) if 
err != nil { logger.Error("error starting preprocessor", "err", err.Error()) return } archiverOutputChan := make(chan *models.Item) - err = archiver.Start(preprocessorOutputChan, archiverOutputChan, seedErrorChan) + err = archiver.Start(preprocessorOutputChan, archiverOutputChan) if err != nil { logger.Error("error starting archiver", "err", err.Error()) return } postprocessorOutputChan := make(chan *models.Item) - err = postprocessor.Start(archiverOutputChan, postprocessorOutputChan, seedErrorChan) + err = postprocessor.Start(archiverOutputChan, postprocessorOutputChan) if err != nil { logger.Error("error starting postprocessor", "err", err.Error()) return @@ -86,7 +84,7 @@ func main() { return } - err = finisher.Start(postprocessorOutputChan, seedErrorChan, hqFinishChan, hqProduceChan) + err = finisher.Start(postprocessorOutputChan, hqFinishChan, hqProduceChan) if err != nil { logger.Error("error starting finisher", "err", err.Error()) return @@ -105,15 +103,13 @@ func main() { logger.Info("received second shutdown signal, forcing exit...") os.Exit(1) }() - case item := <-seedErrorChan: - logger.Error("received error from seedErrorChan", "err", item.GetError()) } finisher.Stop() - hq.Stop() postprocessor.Stop() archiver.Stop() preprocessor.Stop() + hq.Stop() reactor.Stop() logger.Info("all services stopped, exiting") return From 8fdbf2c13567b86ab9d05d29a5d027adf89b6707 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Wed, 20 Nov 2024 18:32:05 +0100 Subject: [PATCH 074/295] global: enhanced concurrency and management of routines ; reactor: added a freeze context used to refrain other routines to interact with it --- go.mod | 2 +- go.sum | 4 +- internal/pkg/archiver/archiver.go | 31 +++++++++---- internal/pkg/postprocessor/postprocessor.go | 31 +++++++++---- internal/pkg/preprocessor/preprocessor.go | 30 +++++++++---- internal/pkg/reactor/error.go | 2 + internal/pkg/reactor/reactor.go | 50 ++++++++++++++------- internal/pkg/source/hq/consumer.go | 21 +++++---- 
internal/pkg/source/hq/finisher.go | 32 ++++++------- internal/pkg/source/hq/producer.go | 25 +++++++---- main.go | 9 ++++ 11 files changed, 157 insertions(+), 80 deletions(-) diff --git a/go.mod b/go.mod index f873fe26..5d85f02b 100644 --- a/go.mod +++ b/go.mod @@ -8,7 +8,7 @@ require ( github.com/elastic/go-elasticsearch v0.0.0 github.com/elastic/go-elasticsearch/v7 v7.17.10 github.com/google/uuid v1.6.0 - github.com/internetarchive/gocrawlhq v1.2.20 + github.com/internetarchive/gocrawlhq v1.2.21 github.com/philippgille/gokv/leveldb v0.7.0 github.com/spf13/cobra v1.8.1 github.com/spf13/pflag v1.0.5 diff --git a/go.sum b/go.sum index 95aa41fe..068f9026 100644 --- a/go.sum +++ b/go.sum @@ -44,8 +44,8 @@ github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= -github.com/internetarchive/gocrawlhq v1.2.20 h1:0mIIt9lhPacKr6L2JeISoopQ8EgzC3dISJ3ITGGbOp4= -github.com/internetarchive/gocrawlhq v1.2.20/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek= +github.com/internetarchive/gocrawlhq v1.2.21 h1:/3KbioYTvX4HIH4ZgDehp2a6k9cijTz7rBjTLYtlSIk= +github.com/internetarchive/gocrawlhq v1.2.21/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek= github.com/klauspost/compress v1.17.10 h1:oXAz+Vh0PMUvJczoi+flxpnBEPxoER1IaAnU/NMPtT0= github.com/klauspost/compress v1.17.10/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index 5366da84..534553f8 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -71,24 +71,33 @@ func Stop() { if globalArchiver != nil { globalArchiver.cancel() 
globalArchiver.wg.Wait() - close(globalArchiver.outputCh) logger.Info("stopped") } } func run() { + logger := log.NewFieldedLogger(&log.Fields{ + "component": "archiver.run", + }) + defer globalArchiver.wg.Done() - var ( - wg sync.WaitGroup - guard = make(chan struct{}, config.Get().WorkersCount) - ) + // Create a context to manage goroutines + ctx, cancel := context.WithCancel(globalArchiver.ctx) + defer cancel() + + // Create a wait group to wait for all goroutines to finish + var wg sync.WaitGroup + + // Guard to limit the number of concurrent archiver routines + guard := make(chan struct{}, config.Get().WorkersCount) for { select { // Closes the run routine when context is canceled case <-globalArchiver.ctx.Done(): logger.Debug("shutting down") + wg.Wait() return case item, ok := <-globalArchiver.inputCh: if ok { @@ -96,13 +105,19 @@ func run() { guard <- struct{}{} wg.Add(1) stats.ArchiverRoutinesIncr() - go func() { + go func(ctx context.Context) { defer wg.Done() defer func() { <-guard }() defer stats.ArchiverRoutinesDecr() + archive(item) - globalArchiver.outputCh <- item - }() + + select { + case <-ctx.Done(): + return + case globalArchiver.outputCh <- item: + } + }(ctx) } } } diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index a56d8110..bd969c49 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -62,24 +62,33 @@ func Stop() { if globalPostprocessor != nil { globalPostprocessor.cancel() globalPostprocessor.wg.Wait() - close(globalPostprocessor.outputCh) logger.Info("stopped") } } func run() { + logger := log.NewFieldedLogger(&log.Fields{ + "component": "postprocessor.run", + }) + defer globalPostprocessor.wg.Done() - var ( - wg sync.WaitGroup - guard = make(chan struct{}, config.Get().WorkersCount) - ) + // Create a context to manage goroutines + ctx, cancel := context.WithCancel(globalPostprocessor.ctx) + defer cancel() + + // Create a wait 
group to wait for all goroutines to finish + var wg sync.WaitGroup + + // Guard to limit the number of concurrent archiver routines + guard := make(chan struct{}, config.Get().WorkersCount) for { select { // Closes the run routine when context is canceled case <-globalPostprocessor.ctx.Done(): logger.Debug("shutting down") + wg.Wait() return case item, ok := <-globalPostprocessor.inputCh: if ok { @@ -87,13 +96,19 @@ func run() { guard <- struct{}{} wg.Add(1) stats.PostprocessorRoutinesIncr() - go func() { + go func(ctx context.Context) { defer wg.Done() defer func() { <-guard }() defer stats.PostprocessorRoutinesDecr() + postprocess(item) - globalPostprocessor.outputCh <- item - }() + + select { + case <-ctx.Done(): + return + case globalPostprocessor.outputCh <- item: + } + }(ctx) } } } diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index e1687e26..0458c471 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -65,18 +65,26 @@ func Stop() { if globalPreprocessor != nil { globalPreprocessor.cancel() globalPreprocessor.wg.Wait() - close(globalPreprocessor.outputCh) logger.Info("stopped") } } func run() { + logger := log.NewFieldedLogger(&log.Fields{ + "component": "preprocessor.run", + }) + defer globalPreprocessor.wg.Done() - var ( - wg sync.WaitGroup - guard = make(chan struct{}, config.Get().WorkersCount) - ) + // Create a context to manage goroutines + ctx, cancel := context.WithCancel(globalPreprocessor.ctx) + defer cancel() + + // Create a wait group to wait for all goroutines to finish + var wg sync.WaitGroup + + // Guard to limit the number of concurrent archiver routines + guard := make(chan struct{}, config.Get().WorkersCount) for { select { @@ -90,13 +98,19 @@ func run() { guard <- struct{}{} wg.Add(1) stats.PreprocessorRoutinesIncr() - go func() { + go func(ctx context.Context) { defer wg.Done() defer func() { <-guard }() defer 
stats.PreprocessorRoutinesDecr() + preprocess(item) - globalPreprocessor.outputCh <- item - }() + + select { + case <-ctx.Done(): + return + case globalPreprocessor.outputCh <- item: + } + }(ctx) } } } diff --git a/internal/pkg/reactor/error.go b/internal/pkg/reactor/error.go index e51abae1..deb42701 100644 --- a/internal/pkg/reactor/error.go +++ b/internal/pkg/reactor/error.go @@ -9,6 +9,8 @@ var ( ErrReactorNotInitialized = errors.New("reactor not initialized") // ErrReactorShuttingDown is the error returned when the reactor is shutting down ErrReactorShuttingDown = errors.New("reactor shutting down") + // ErrReactorFrozen is the error returned when the reactor is frozen + ErrReactorFrozen = errors.New("reactor frozen") // ErrFeedbackItemNotPresent is the error returned when an item was sent to the feedback channel but not found in the state table ErrFeedbackItemNotPresent = errors.New("feedback item not present in state table") diff --git a/internal/pkg/reactor/reactor.go b/internal/pkg/reactor/reactor.go index 18b7ac82..2f13ad21 100644 --- a/internal/pkg/reactor/reactor.go +++ b/internal/pkg/reactor/reactor.go @@ -11,13 +11,15 @@ import ( // reactor struct holds the state and channels for managing seeds processing. 
type reactor struct { - tokenPool chan struct{} // Token pool to control asset count - ctx context.Context // Context for stopping the reactor - cancel context.CancelFunc // Context's cancel func - input chan *models.Item // Combined input channel for source and feedback - output chan *models.Item // Output channel - stateTable sync.Map // State table for tracking seeds by UUID - wg sync.WaitGroup // WaitGroup to manage goroutines + tokenPool chan struct{} // Token pool to control asset count + ctx context.Context // Context for stopping the reactor + cancel context.CancelFunc // Context's cancel func + freezeCtx context.Context // Context for freezing the reactor + freezeCancel context.CancelFunc // Freezing context's cancel func + input chan *models.Item // Combined input channel for source and feedback + output chan *models.Item // Output channel + stateTable sync.Map // State table for tracking seeds by UUID + wg sync.WaitGroup // WaitGroup to manage goroutines // stopChan chan struct{} // Channel to signal when stop is finished } @@ -39,12 +41,15 @@ func Start(maxTokens int, outputChan chan *models.Item) error { once.Do(func() { ctx, cancel := context.WithCancel(context.Background()) + freezeCtx, freezeCancel := context.WithCancel(ctx) globalReactor = &reactor{ - tokenPool: make(chan struct{}, maxTokens), - ctx: ctx, - cancel: cancel, - input: make(chan *models.Item, maxTokens), - output: outputChan, + tokenPool: make(chan struct{}, maxTokens), + ctx: ctx, + cancel: cancel, + freezeCtx: freezeCtx, + freezeCancel: freezeCancel, + input: make(chan *models.Item, maxTokens), + output: outputChan, } logger.Debug("initialized") globalReactor.wg.Add(1) @@ -67,13 +72,20 @@ func Stop() { globalReactor.cancel() globalReactor.wg.Wait() close(globalReactor.input) - close(globalReactor.tokenPool) once = sync.Once{} globalReactor = nil logger.Info("stopped") } } +// Freeze stops the global reactor from processing seeds. 
+func Freeze() { + if globalReactor != nil { + logger.Debug("received freeze signal") + globalReactor.freezeCancel() + } +} + // ReceiveFeedback sends an item to the feedback channel. // If the item is not present on the state table it gets discarded func ReceiveFeedback(item *models.Item) error { @@ -88,10 +100,12 @@ func ReceiveFeedback(item *models.Item) error { return ErrFeedbackItemNotPresent } select { - case globalReactor.input <- item: - return nil case <-globalReactor.ctx.Done(): return ErrReactorShuttingDown + case <-globalReactor.freezeCtx.Done(): + return ErrReactorFrozen + case globalReactor.input <- item: + return nil } } @@ -104,6 +118,10 @@ func ReceiveInsert(item *models.Item) error { } select { + case <-globalReactor.ctx.Done(): + return ErrReactorShuttingDown + case <-globalReactor.freezeCtx.Done(): + return ErrReactorFrozen case globalReactor.tokenPool <- struct{}{}: if item.Source != models.ItemSourceQueue && item.Source != models.ItemSourceHQ { item.Source = models.ItemSourceInsert @@ -111,8 +129,6 @@ func ReceiveInsert(item *models.Item) error { globalReactor.stateTable.Store(item.ID, item) globalReactor.input <- item return nil - case <-globalReactor.ctx.Done(): - return ErrReactorShuttingDown } } diff --git a/internal/pkg/source/hq/consumer.go b/internal/pkg/source/hq/consumer.go index 5a07f609..a300502a 100644 --- a/internal/pkg/source/hq/consumer.go +++ b/internal/pkg/source/hq/consumer.go @@ -43,16 +43,13 @@ func consumer() { select { case <-globalHQ.ctx.Done(): logger.Debug("received done signal") - // Cancel the context to stop all goroutines - cancel() - logger.Debug("waiting for goroutines to finish") - // Wait for all goroutines to finish - wg.Wait() - // Close the urlBuffer to signal consumerSenders to finish close(urlBuffer) + // Wait for all goroutines to finish + wg.Wait() + globalHQ.wg.Done() logger.Debug("closed") @@ -62,11 +59,12 @@ func consumer() { } func consumerFetcher(ctx context.Context, wg *sync.WaitGroup, urlBuffer 
chan<- *gocrawlhq.URL, batchSize int) { + defer wg.Done() + logger := log.NewFieldedLogger(&log.Fields{ "component": "hq.consumerFetcher", }) - defer wg.Done() for { // Check for context cancellation select { @@ -92,20 +90,21 @@ func consumerFetcher(ctx context.Context, wg *sync.WaitGroup, urlBuffer chan<- * for _, URL := range URLs { select { case <-ctx.Done(): + logger.Debug("closing") return case urlBuffer <- &URL: - } } } } func consumerSender(ctx context.Context, wg *sync.WaitGroup, urlBuffer <-chan *gocrawlhq.URL) { + defer wg.Done() + logger := log.NewFieldedLogger(&log.Fields{ "component": "hq.consumerSender", }) - defer wg.Done() for { select { case <-ctx.Done(): @@ -113,13 +112,13 @@ func consumerSender(ctx context.Context, wg *sync.WaitGroup, urlBuffer <-chan *g return case URL, ok := <-urlBuffer: if !ok { - // Channel closed, exit the consumerSender + logger.Debug("closing") return } // Process the URL and send to reactor err := processAndSend(URL) - if err != nil { + if err != nil && err != reactor.ErrReactorFrozen { panic(err) } } diff --git a/internal/pkg/source/hq/finisher.go b/internal/pkg/source/hq/finisher.go index ebc457a8..5fd236ef 100644 --- a/internal/pkg/source/hq/finisher.go +++ b/internal/pkg/source/hq/finisher.go @@ -41,16 +41,14 @@ func finisher() { select { case <-globalHQ.ctx.Done(): logger.Debug("received done signal") - // Cancel the context to stop all goroutines. - cancel() - logger.Debug("waiting for goroutines to finish") - // Wait for the finisher and dispatcher to finish. - wg.Wait() // Close the batch channel to signal the dispatcher to finish. close(batchCh) + // Wait for the finisher and dispatcher to finish. + wg.Wait() + globalHQ.wg.Done() logger.Debug("closed") @@ -80,11 +78,6 @@ func finisherReceiver(ctx context.Context, wg *sync.WaitGroup, batchCh chan *fin select { case <-ctx.Done(): logger.Debug("closing") - // Send any remaining URLs. 
- if len(batch.URLs) > 0 { - logger.Debug("while closing sending remaining batch to dispatcher", "size", len(batch.URLs)) - batchCh <- batch // Blocks if batchCh is full. - } return case item := <-globalHQ.finishCh: logger.Debug("received item", "item", item.GetShortID()) @@ -131,7 +124,19 @@ func finisherDispatcher(ctx context.Context, wg *sync.WaitGroup, batchCh chan *f for { select { - case batch := <-batchCh: + case <-ctx.Done(): + logger.Debug("closing") + // Wait for all sender routines to finish. + senderWg.Wait() + return + case batch, ok := <-batchCh: + if !ok { + logger.Debug("closing") + // Wait for all sender routines to finish. + senderWg.Wait() + return + } + senderSemaphore <- struct{}{} // Blocks if maxSenders reached. senderWg.Add(1) logger.Debug("dispatching batch to sender", "size", len(batch.URLs)) @@ -140,11 +145,6 @@ func finisherDispatcher(ctx context.Context, wg *sync.WaitGroup, batchCh chan *f defer func() { <-senderSemaphore }() finisherSender(ctx, batch) }(batch) - case <-ctx.Done(): - logger.Debug("closing") - // Wait for all sender routines to finish. - senderWg.Wait() - return } } } diff --git a/internal/pkg/source/hq/producer.go b/internal/pkg/source/hq/producer.go index f4d71e41..00b577f8 100644 --- a/internal/pkg/source/hq/producer.go +++ b/internal/pkg/source/hq/producer.go @@ -127,24 +127,31 @@ func producerDispatcher(ctx context.Context, wg *sync.WaitGroup, batchCh chan *p maxSenders := getMaxProducerSenders() senderSemaphore := make(chan struct{}, maxSenders) - var senderWg sync.WaitGroup + var producerWg sync.WaitGroup for { select { - case batch := <-batchCh: + case <-ctx.Done(): + logger.Debug("closing") + // Wait for all sender routines to finish. + producerWg.Wait() + return + case batch, ok := <-batchCh: + if !ok { + logger.Debug("closing") + // Wait for all sender routines to finish. + producerWg.Wait() + return + } + senderSemaphore <- struct{}{} // Blocks if maxSenders reached. 
- senderWg.Add(1) + producerWg.Add(1) logger.Debug("dispatching batch to sender", "size", len(batch.URLs)) go func(batch *producerBatch) { - defer senderWg.Done() + defer producerWg.Done() defer func() { <-senderSemaphore }() producerSender(ctx, batch) }(batch) - case <-ctx.Done(): - logger.Debug("closing") - // Wait for all sender routines to finish. - senderWg.Wait() - return } } } diff --git a/main.go b/main.go index 771fd298..f8db8264 100644 --- a/main.go +++ b/main.go @@ -109,8 +109,17 @@ func main() { postprocessor.Stop() archiver.Stop() preprocessor.Stop() + reactor.Freeze() hq.Stop() reactor.Stop() + + close(reactorOutputChan) + close(preprocessorOutputChan) + close(archiverOutputChan) + close(postprocessorOutputChan) + close(hqFinishChan) + close(hqProduceChan) + logger.Info("all services stopped, exiting") return } From ba0049bf2765ef66675c6fd1a496b16fb4bd5db6 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Wed, 20 Nov 2024 19:48:55 +0100 Subject: [PATCH 075/295] stats: increase stats where needed --- go.mod | 7 +++ go.sum | 48 ++++++++++++++++ internal/pkg/archiver/archiver.go | 2 + internal/pkg/finisher/finisher.go | 2 + internal/pkg/stats/ui.go | 91 +++++++++++++++++++++++++++++++ main.go | 3 + 6 files changed, 153 insertions(+) create mode 100644 internal/pkg/stats/ui.go diff --git a/go.mod b/go.mod index 5d85f02b..93b57582 100644 --- a/go.mod +++ b/go.mod @@ -21,6 +21,8 @@ require ( github.com/andybalholm/brotli v1.1.0 // indirect github.com/cloudflare/circl v1.4.0 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect + github.com/gdamore/encoding v1.0.0 // indirect + github.com/gdamore/tcell/v2 v2.7.1 // indirect github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/ws v1.4.0 // indirect @@ -28,7 +30,9 @@ require ( github.com/hashicorp/hcl v1.0.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/klauspost/compress v1.17.10 // indirect + 
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect github.com/magiconair/properties v1.8.7 // indirect + github.com/mattn/go-runewidth v0.0.15 // indirect github.com/miekg/dns v1.1.62 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/paulbellamy/ratecounter v0.2.0 // indirect @@ -36,6 +40,8 @@ require ( github.com/philippgille/gokv/encoding v0.7.0 // indirect github.com/philippgille/gokv/util v0.7.0 // indirect github.com/refraction-networking/utls v1.6.7 // indirect + github.com/rivo/tview v0.0.0-20241103174730-c76f7879f592 // indirect + github.com/rivo/uniseg v0.4.7 // indirect github.com/sagikazarmark/locafero v0.4.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect github.com/sourcegraph/conc v0.3.0 // indirect @@ -51,6 +57,7 @@ require ( golang.org/x/mod v0.21.0 // indirect golang.org/x/sync v0.9.0 // indirect golang.org/x/sys v0.27.0 // indirect + golang.org/x/term v0.26.0 // indirect golang.org/x/text v0.20.0 // indirect golang.org/x/tools v0.25.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect diff --git a/go.sum b/go.sum index 068f9026..30f1ceba 100644 --- a/go.sum +++ b/go.sum @@ -22,6 +22,10 @@ github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7z github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/gdamore/encoding v1.0.0 h1:+7OoQ1Bc6eTm5niUzBa0Ctsh6JbMW6Ra+YNuAtDBdko= +github.com/gdamore/encoding v1.0.0/go.mod h1:alR0ol34c49FCSBLjhosxzcPHQbf2trDkoo5dl+VrEg= +github.com/gdamore/tcell/v2 v2.7.1 h1:TiCcmpWHiAU7F0rA2I3S2Y4mmLmO9KHxJ7E1QhYzQbc= +github.com/gdamore/tcell/v2 v2.7.1/go.mod h1:dSXtXTSK0VsW1biw65DZLZ2NKr7j0qP/0J7ONmsraWg= github.com/go-test/deep v1.1.0 h1:WOcxcdHcvdgThNXjw0t76K42FXTU7HpNQWHpA2HHNlg= github.com/go-test/deep v1.1.0/go.mod 
h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE= github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU= @@ -52,8 +56,12 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY= +github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= +github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U= +github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/miekg/dns v1.1.62 h1:cN8OuEF1/x5Rq6Np+h1epln8OiyPWV+lROx9LxcGgIQ= github.com/miekg/dns v1.1.62/go.mod h1:mvDlcItzm+br7MToIKqkglaGhlFMHJ9DTNNWONWXbNQ= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= @@ -82,6 +90,12 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/refraction-networking/utls v1.6.7 h1:zVJ7sP1dJx/WtVuITug3qYUq034cDq9B2MR1K67ULZM= github.com/refraction-networking/utls v1.6.7/go.mod h1:BC3O4vQzye5hqpmDTWUqi4P5DDhzJfkV1tdqtawQIH0= +github.com/rivo/tview v0.0.0-20241103174730-c76f7879f592 h1:YIJ+B1hePP6AgynC5TcqpO0H9k3SSoZa2BGyL6vDUzM= +github.com/rivo/tview v0.0.0-20241103174730-c76f7879f592/go.mod h1:02iFIz7K/A9jGCvrizLPvoqr4cEIx7q54RH5Qudkrss= +github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= 
+github.com/rivo/uniseg v0.4.3/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= +github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= +github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= @@ -117,33 +131,67 @@ github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFd github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= github.com/ulikunitz/xz v0.5.12 h1:37Nm15o69RwBkXM0J6A5OlE67RZTfzUxTj8fB3dfcsc= github.com/ulikunitz/xz v0.5.12/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ= golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg= golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g= golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod 
h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0= golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo= golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ= golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys 
v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU= +golang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod 
h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.25.0 h1:oFU9pkj/iJgs+0DT+VMHrx+oBKs/LJMV+Uvg78sl+fE= golang.org/x/tools v0.25.0/go.mod h1:/vtpO8WL1N9cQC3FN5zPqb//fRXskFHbLKk4OW1Q7rg= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index 534553f8..6e2abe42 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -172,6 +172,8 @@ func archive(item *models.Item) { return } + stats.URLsCrawledIncr() + // Set the response in the item URL.SetResponse(resp) diff --git a/internal/pkg/finisher/finisher.go b/internal/pkg/finisher/finisher.go index 94241c9b..7a89099a 100644 --- a/internal/pkg/finisher/finisher.go +++ b/internal/pkg/finisher/finisher.go @@ -6,6 +6,7 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/log" "github.com/internetarchive/Zeno/internal/pkg/reactor" + "github.com/internetarchive/Zeno/internal/pkg/stats" "github.com/internetarchive/Zeno/pkg/models" ) @@ -108,6 +109,7 @@ func (f *finisher) run() { } f.sourceFinishedCh <- item logger.Info("crawled", "url", item.GetURL(), "item", item.GetShortID()) + stats.SeedsFinishedIncr() } logger.Debug("item finished", "item", item.GetShortID()) diff --git a/internal/pkg/stats/ui.go b/internal/pkg/stats/ui.go new file mode 100644 index 00000000..4632cfd0 --- /dev/null +++ b/internal/pkg/stats/ui.go @@ -0,0 +1,91 @@ +package stats + +import 
( + "context" + "fmt" + "sync" + "time" + + "github.com/rivo/tview" +) + +var ( + ctx context.Context + cancel context.CancelFunc +) + +func StartUI() { + // Create a context to manage goroutines + ctx, cancel = context.WithCancel(context.Background()) + defer cancel() + + var wg sync.WaitGroup + + wg.Add(1) + go ui(ctx, &wg) + + // Wait for the context to be canceled. + for { + select { + case <-ctx.Done(): + cancel() + wg.Wait() + return + } + } +} + +func StopUI() { + cancel() +} + +func ui(ctx context.Context, wg *sync.WaitGroup) { + defer wg.Done() + + // Create a new application + app := tview.NewApplication() + + // Create text views for each stat + urlsCrawledText := tview.NewTextView().SetDynamicColors(true) + seedsFinishedText := tview.NewTextView().SetDynamicColors(true) + preprocessorRoutinesText := tview.NewTextView().SetDynamicColors(true) + archiverRoutinesText := tview.NewTextView().SetDynamicColors(true) + postprocessorRoutinesText := tview.NewTextView().SetDynamicColors(true) + + // Create a flex layout to hold the text views + flex := tview.NewFlex(). + SetDirection(tview.FlexRow). + AddItem(urlsCrawledText, 0, 1, false). + AddItem(seedsFinishedText, 0, 1, false). + AddItem(preprocessorRoutinesText, 0, 1, false). + AddItem(archiverRoutinesText, 0, 1, false). 
+ AddItem(postprocessorRoutinesText, 0, 1, false) + + // Function to update the stats + go func(ctx context.Context) { + for { + select { + case <-ctx.Done(): + app.Stop() + return + default: + // Sleep for a while before updating again + time.Sleep(250 * time.Millisecond) + + // Update text views + urlsCrawledText.SetText(fmt.Sprintf("URLs Crawled: %d", URLsCrawledGet())) + seedsFinishedText.SetText(fmt.Sprintf("Seeds Finished: %d", SeedsFinishedGet())) + preprocessorRoutinesText.SetText(fmt.Sprintf("Preprocessor Routines: %d", PreprocessorRoutinesGet())) + archiverRoutinesText.SetText(fmt.Sprintf("Archiver Routines: %d", ArchiverRoutinesGet())) + postprocessorRoutinesText.SetText(fmt.Sprintf("Postprocessor Routines: %d", PostprocessorRoutinesGet())) + + // Refresh the UI + app.Draw() + } + } + }(ctx) + + if err := app.SetRoot(flex, true).Run(); err != nil { + panic(err) + } +} diff --git a/main.go b/main.go index f8db8264..d58855a7 100644 --- a/main.go +++ b/main.go @@ -23,6 +23,7 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/preprocessor/seencheck" "github.com/internetarchive/Zeno/internal/pkg/reactor" "github.com/internetarchive/Zeno/internal/pkg/source/hq" + "github.com/internetarchive/Zeno/internal/pkg/stats" "github.com/internetarchive/Zeno/pkg/models" ) @@ -42,6 +43,8 @@ func main() { return } + stats.Init() + // If needed, start the seencheck process if config.Get().UseSeencheck { err := seencheck.Start(config.Get().JobPath) From 1eb26e48a4580193caf2749b28f88b4a30448c12 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Thu, 21 Nov 2024 10:17:36 +0100 Subject: [PATCH 076/295] archiver: wait & close WARC writers --- internal/pkg/archiver/archiver.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index 6e2abe42..5f921709 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -71,6 +71,15 @@ func Stop() { if globalArchiver != nil 
{ globalArchiver.cancel() globalArchiver.wg.Wait() + + // Wait for the WARC writing to finish + globalArchiver.Client.WaitGroup.Wait() + globalArchiver.Client.Close() + if globalArchiver.ClientWithProxy != nil { + globalArchiver.ClientWithProxy.WaitGroup.Wait() + globalArchiver.ClientWithProxy.Close() + } + logger.Info("stopped") } } From 805c33744a4d1524baff20cbf22d83fde29aa741 Mon Sep 17 00:00:00 2001 From: Will Howes Date: Thu, 21 Nov 2024 10:41:25 +0100 Subject: [PATCH 077/295] turn String into a wrapper function for URLToString --- internal/pkg/preprocessor/url.go | 2 +- pkg/models/url.go | 24 ++++++++++++++---------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/internal/pkg/preprocessor/url.go b/internal/pkg/preprocessor/url.go index 7beeefb7..8f2344e3 100644 --- a/internal/pkg/preprocessor/url.go +++ b/internal/pkg/preprocessor/url.go @@ -21,7 +21,7 @@ func normalizeURL(URL *models.URL, parentURL *models.URL) (err error) { if parsedURL.Scheme == "" { parsedURL.Scheme = "http" } - ada_result, err = goada.New(parsedURL.String()) + ada_result, err = goada.New(models.URLToString(parsedURL)) if err != nil { return err } diff --git a/pkg/models/url.go b/pkg/models/url.go index afab4f94..64ce65d7 100644 --- a/pkg/models/url.go +++ b/pkg/models/url.go @@ -60,29 +60,33 @@ func (u *URL) GetHops() int { return u.Hops } -// String exists to apply some custom stuff, in opposition of simply -// using the u.parsed.String() method func (u *URL) String() string { + return URLToString(u.parsed) +} + +// URLToString exists to apply some custom stuff, in opposition of simply +// using the u.parsed.String() method +func URLToString(URL *url.URL) string { var err error - switch u.parsed.Host { + switch URL.Host { case "external-preview.redd.it", "styles.redditmedia.com", "preview.redd.it": // Do nothing. We don't want to encode the URL for signature purposes. 
:( break default: - q := u.parsed.Query() - u.parsed.RawQuery = encodeQuery(q) + q := URL.Query() + URL.RawQuery = encodeQuery(q) } - u.parsed.Host, err = idna.ToASCII(u.parsed.Host) + URL.Host, err = idna.ToASCII(URL.Host) if err != nil { - if strings.Contains(u.parsed.Host, ":") { - hostWithoutPort, port, err := net.SplitHostPort(u.parsed.Host) + if strings.Contains(URL.Host, ":") { + hostWithoutPort, port, err := net.SplitHostPort(URL.Host) if err != nil { slog.Warn("cannot split host and port", "error", err) } else { asciiHost, err := idna.ToASCII(hostWithoutPort) if err == nil { - u.parsed.Host = asciiHost + ":" + port + URL.Host = asciiHost + ":" + port } else { slog.Warn("cannot encode punycode host without port to ASCII", "error", err) } @@ -92,7 +96,7 @@ func (u *URL) String() string { } } - return u.parsed.String() + return URL.String() } // Encode encodes the values into “URL encoded” form From 0c8d65507a4598116b2f5311712214854fe73a36 Mon Sep 17 00:00:00 2001 From: Will Howes Date: Thu, 21 Nov 2024 10:42:58 +0100 Subject: [PATCH 078/295] simple variable renaming --- internal/pkg/preprocessor/url.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/internal/pkg/preprocessor/url.go b/internal/pkg/preprocessor/url.go index 8f2344e3..bf8d2c7e 100644 --- a/internal/pkg/preprocessor/url.go +++ b/internal/pkg/preprocessor/url.go @@ -12,7 +12,7 @@ func normalizeURL(URL *models.URL, parentURL *models.URL) (err error) { // and converting relative URLs into absolute URLs. An error is returned if the URL // cannot be normalized. 
- var ada_result *goada.Url + var adaParse *goada.Url if parentURL == nil { parsedURL, err := url.Parse(URL.Raw) if err != nil { @@ -21,17 +21,17 @@ func normalizeURL(URL *models.URL, parentURL *models.URL) (err error) { if parsedURL.Scheme == "" { parsedURL.Scheme = "http" } - ada_result, err = goada.New(models.URLToString(parsedURL)) + adaParse, err = goada.New(models.URLToString(parsedURL)) if err != nil { return err } } else { - ada_result, err = goada.NewWithBase(URL.Raw, parentURL.Raw) + adaParse, err = goada.NewWithBase(URL.Raw, parentURL.Raw) if err != nil { return err } } - ada_result.SetHash("") - URL.Raw = ada_result.Href() + adaParse.SetHash("") + URL.Raw = adaParse.Href() return URL.Parse() } From ad24996a8e4e2198ffc5426a4c255fe307b0e79e Mon Sep 17 00:00:00 2001 From: Will Howes Date: Thu, 21 Nov 2024 11:34:38 +0100 Subject: [PATCH 079/295] improve URL scheme handling, update TestNormalizeURL --- internal/pkg/preprocessor/error.go | 2 ++ internal/pkg/preprocessor/url.go | 17 ++++++++++++----- internal/pkg/preprocessor/url_test.go | 6 ++++++ 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/internal/pkg/preprocessor/error.go b/internal/pkg/preprocessor/error.go index ca62b09c..c30d83f3 100644 --- a/internal/pkg/preprocessor/error.go +++ b/internal/pkg/preprocessor/error.go @@ -5,4 +5,6 @@ import "errors" var ( // ErrPreprocessorAlreadyInitialized is the error returned when the preprocessor is already initialized ErrPreprocessorAlreadyInitialized = errors.New("preprocessor already initialized") + //ErrSchemeIsInvalid is the error returned when the scheme of a URL is not http or http + ErrUnsupportedScheme = errors.New("URL scheme is unsupported") ) diff --git a/internal/pkg/preprocessor/url.go b/internal/pkg/preprocessor/url.go index bf8d2c7e..735b4c58 100644 --- a/internal/pkg/preprocessor/url.go +++ b/internal/pkg/preprocessor/url.go @@ -7,31 +7,38 @@ import ( "github.com/internetarchive/Zeno/pkg/models" ) +// Normalize the URL by removing 
fragments, attempting to add URL scheme if missing, +// and converting relative URLs into absolute URLs. An error is returned if the URL +// cannot be normalized. func normalizeURL(URL *models.URL, parentURL *models.URL) (err error) { - // Normalize the URL by removing fragments, attempting to add URL scheme if missing, - // and converting relative URLs into absolute URLs. An error is returned if the URL - // cannot be normalized. - var adaParse *goada.Url + if parentURL == nil { parsedURL, err := url.Parse(URL.Raw) if err != nil { return err } + if parsedURL.Scheme == "" { parsedURL.Scheme = "http" } + adaParse, err = goada.New(models.URLToString(parsedURL)) if err != nil { return err } } else { - adaParse, err = goada.NewWithBase(URL.Raw, parentURL.Raw) + adaParse, err = goada.NewWithBase(URL.Raw, parentURL.String()) if err != nil { return err } } + adaParse.SetHash("") + if scheme := adaParse.Protocol(); scheme != "http:" && scheme != "https:" { + return ErrUnsupportedScheme + } URL.Raw = adaParse.Href() + return URL.Parse() } diff --git a/internal/pkg/preprocessor/url_test.go b/internal/pkg/preprocessor/url_test.go index 991fa8bc..12c80cde 100644 --- a/internal/pkg/preprocessor/url_test.go +++ b/internal/pkg/preprocessor/url_test.go @@ -38,6 +38,11 @@ func TestNormalizeURL(t *testing.T) { wantErr: false, expectedURL: "http://www.google.com/", }, + { + name: "FTP url", + rawURL: "ftp://ftp.example.com", + wantErr: true, + }, { name: "valid URL with path without scheme", rawURL: "www.google.com/dogs", @@ -53,6 +58,7 @@ func TestNormalizeURL(t *testing.T) { var parentURL *models.URL if tt.parentURL != "" { parentURL = &models.URL{Raw: tt.parentURL} + parentURL.Parse() } err := normalizeURL(url, parentURL) if (err != nil) != tt.wantErr { From da22cf7a0ff8719a283dee948c5a21e428114e59 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Fri, 22 Nov 2024 17:34:07 +0100 Subject: [PATCH 080/295] add: get url command --- cmd/get_hq.go | 2 +- cmd/get_url.go | 5 +++ 
internal/pkg/config/config.go | 4 ++- internal/pkg/finisher/finisher.go | 8 ++++- internal/pkg/preprocessor/preprocessor.go | 2 +- main.go | 44 ++++++++++++++++++----- 6 files changed, 52 insertions(+), 13 deletions(-) diff --git a/cmd/get_hq.go b/cmd/get_hq.go index d179853b..567d4bfe 100644 --- a/cmd/get_hq.go +++ b/cmd/get_hq.go @@ -15,7 +15,7 @@ var getHQCmd = &cobra.Command{ return fmt.Errorf("viper config is nil") } - cfg.HQ = true + cfg.UseHQ = true return nil }, diff --git a/cmd/get_url.go b/cmd/get_url.go index 2f74f769..e2a300bb 100644 --- a/cmd/get_url.go +++ b/cmd/get_url.go @@ -15,9 +15,14 @@ var getURLCmd = &cobra.Command{ if cfg == nil { return fmt.Errorf("viper config is nil") } + return nil }, RunE: func(cmd *cobra.Command, args []string) error { + for _, URL := range args { + config.Get().InputSeeds = append(config.Get().InputSeeds, URL) + } + return config.GenerateCrawlConfig() }, } diff --git a/internal/pkg/config/config.go b/internal/pkg/config/config.go index 8867bdd1..091fb211 100644 --- a/internal/pkg/config/config.go +++ b/internal/pkg/config/config.go @@ -80,7 +80,7 @@ type Config struct { DisableLocalDedupe bool `mapstructure:"disable-local-dedupe"` CertValidation bool `mapstructure:"cert-validation"` DisableAssetsCapture bool `mapstructure:"disable-assets-capture"` - HQ bool // Special field to check if HQ is enabled depending on the command called + UseHQ bool // Special field to check if HQ is enabled depending on the command called HQRateLimitSendBack bool `mapstructure:"hq-rate-limiting-send-back"` NoStdoutLogging bool `mapstructure:"no-stdout-log"` NoBatchWriteWAL bool `mapstructure:"ultrasafe-queue"` @@ -97,6 +97,8 @@ type Config struct { // Dependencies NoYTDLP bool `mapstructure:"no-ytdlp"` YTDLPPath string `mapstructure:"ytdlp-path"` + + InputSeeds []string // Special field to store the input URLs } var ( diff --git a/internal/pkg/finisher/finisher.go b/internal/pkg/finisher/finisher.go index 7a89099a..be2c8dc4 100644 --- 
a/internal/pkg/finisher/finisher.go +++ b/internal/pkg/finisher/finisher.go @@ -107,7 +107,13 @@ func (f *finisher) run() { if err != nil { panic(err) } - f.sourceFinishedCh <- item + + // Notify the source that the item has been finished + // E.g.: to delete the item in Crawl HQ + if f.sourceFinishedCh != nil { + f.sourceFinishedCh <- item + } + logger.Info("crawled", "url", item.GetURL(), "item", item.GetShortID()) stats.SeedsFinishedIncr() } diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index 0458c471..193cfc5f 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -163,7 +163,7 @@ func preprocess(item *models.Item) { if config.Get().UseSeencheck && URLType != models.URLTypeSeed { var seencheckedURLs []*models.URL - if config.Get().HQ { + if config.Get().UseHQ { seencheckedURLs, err = hq.SeencheckURLs(string(URLType), item.URL) if err != nil { logger.Warn("unable to seencheck URL", "url", item.URL.Raw, "err", err.Error(), "func", "preprocessor.preprocess") diff --git a/main.go b/main.go index d58855a7..ba3c6604 100644 --- a/main.go +++ b/main.go @@ -79,20 +79,40 @@ func main() { return } - hqFinishChan := make(chan *models.Item) - hqProduceChan := make(chan *models.Item) - err = hq.Start(hqFinishChan, hqProduceChan) - if err != nil { - logger.Error("error starting hq", "err", err.Error()) - return + var finisherFinishChan, finisherProduceChan chan *models.Item + if config.Get().UseHQ { + logger.Info("starting hq") + + finisherFinishChan = make(chan *models.Item) + finisherProduceChan = make(chan *models.Item) + + err = hq.Start(finisherFinishChan, finisherProduceChan) + if err != nil { + logger.Error("error starting hq", "err", err.Error()) + return + } } - err = finisher.Start(postprocessorOutputChan, hqFinishChan, hqProduceChan) + err = finisher.Start(postprocessorOutputChan, finisherFinishChan, finisherProduceChan) if err != nil { logger.Error("error 
starting finisher", "err", err.Error()) return } + // Pipe in the reactor the input seeds if any + if len(config.Get().InputSeeds) > 0 { + for _, seed := range config.Get().InputSeeds { + item := models.NewItem(models.ItemSourceQueue) + item.SetURL(&models.URL{Raw: seed}) + + err = reactor.ReceiveInsert(item) + if err != nil { + logger.Error("unable to insert seed", "err", err.Error()) + return + } + } + } + // Handle OS signals for graceful shutdown signalChan := make(chan os.Signal, 1) signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM) @@ -120,8 +140,14 @@ func main() { close(preprocessorOutputChan) close(archiverOutputChan) close(postprocessorOutputChan) - close(hqFinishChan) - close(hqProduceChan) + + if finisherFinishChan != nil { + close(finisherFinishChan) + } + + if finisherProduceChan != nil { + close(finisherProduceChan) + } logger.Info("all services stopped, exiting") return From 64f17323c135b68e6ed61805397720bc506ac70b Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Fri, 22 Nov 2024 17:34:36 +0100 Subject: [PATCH 081/295] chore: upgrade warc lib --- go.mod | 4 ++-- go.sum | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index 93b57582..507df8be 100644 --- a/go.mod +++ b/go.mod @@ -3,13 +3,14 @@ module github.com/internetarchive/Zeno go 1.23.3 require ( - github.com/CorentinB/warc v0.8.53 + github.com/CorentinB/warc v0.8.54 github.com/ada-url/goada v0.0.0-20240402045241-5e45a5777313 github.com/elastic/go-elasticsearch v0.0.0 github.com/elastic/go-elasticsearch/v7 v7.17.10 github.com/google/uuid v1.6.0 github.com/internetarchive/gocrawlhq v1.2.21 github.com/philippgille/gokv/leveldb v0.7.0 + github.com/rivo/tview v0.0.0-20241103174730-c76f7879f592 github.com/spf13/cobra v1.8.1 github.com/spf13/pflag v1.0.5 github.com/spf13/viper v1.19.0 @@ -40,7 +41,6 @@ require ( github.com/philippgille/gokv/encoding v0.7.0 // indirect github.com/philippgille/gokv/util v0.7.0 // indirect 
github.com/refraction-networking/utls v1.6.7 // indirect - github.com/rivo/tview v0.0.0-20241103174730-c76f7879f592 // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/sagikazarmark/locafero v0.4.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect diff --git a/go.sum b/go.sum index 30f1ceba..709aaa27 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,7 @@ github.com/CorentinB/warc v0.8.53 h1:xVz3RMdZ6faAqTtLfcK1/yl8ZTansy+B2en//EZLUlM= github.com/CorentinB/warc v0.8.53/go.mod h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8= +github.com/CorentinB/warc v0.8.54 h1:Y3iK5FEF+m9pKftWf6PXYRC0sYLbwph9j//d9DEwy9g= +github.com/CorentinB/warc v0.8.54/go.mod h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8= github.com/ada-url/goada v0.0.0-20240402045241-5e45a5777313 h1:jdPBTZ3nZwBBZzz5SCFUMcTxoZr8t9ogwdvD3P27f/E= github.com/ada-url/goada v0.0.0-20240402045241-5e45a5777313/go.mod h1:+D/veNwI2mA1hDYLVrYSobYcLFWm6e3DJ/H/d/dxlu8= github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= From 825b39e3d006a5bd810ff8a7d018cce3b8230960 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Fri, 22 Nov 2024 18:29:51 +0100 Subject: [PATCH 082/295] add: assets extraction --- .old/internal/pkg/crawl/assets.go | 327 +++++++++--------- go.mod | 4 + go.sum | 12 + internal/pkg/archiver/archiver.go | 18 +- internal/pkg/postprocessor/assets.go | 294 ++++++++++++++++ internal/pkg/postprocessor/extractor/json.go | 64 ++++ .../pkg/postprocessor/extractor/json_test.go | 83 +++++ internal/pkg/postprocessor/extractor/m3u8.go | 62 ++++ internal/pkg/postprocessor/extractor/s3.go | 124 +++++++ internal/pkg/postprocessor/extractor/utils.go | 49 +++ internal/pkg/postprocessor/extractor/xml.go | 66 ++++ .../pkg/postprocessor/extractor/xml_test.go | 181 ++++++++++ .../extractor/xml_test_sitemap.xml | 2 + internal/pkg/postprocessor/postprocessor.go | 43 ++- internal/pkg/preprocessor/preprocessor.go | 6 +- pkg/models/item.go | 6 +- 16 files changed, 1152 
insertions(+), 189 deletions(-) create mode 100644 internal/pkg/postprocessor/assets.go create mode 100644 internal/pkg/postprocessor/extractor/json.go create mode 100644 internal/pkg/postprocessor/extractor/json_test.go create mode 100644 internal/pkg/postprocessor/extractor/m3u8.go create mode 100644 internal/pkg/postprocessor/extractor/s3.go create mode 100644 internal/pkg/postprocessor/extractor/utils.go create mode 100644 internal/pkg/postprocessor/extractor/xml.go create mode 100644 internal/pkg/postprocessor/extractor/xml_test.go create mode 100644 internal/pkg/postprocessor/extractor/xml_test_sitemap.xml diff --git a/.old/internal/pkg/crawl/assets.go b/.old/internal/pkg/crawl/assets.go index 9aaa90eb..754602f4 100644 --- a/.old/internal/pkg/crawl/assets.go +++ b/.old/internal/pkg/crawl/assets.go @@ -1,183 +1,178 @@ package crawl import ( - "io" - "net/http" "net/url" - "regexp" "strconv" "strings" - "sync/atomic" "github.com/PuerkitoBio/goquery" "github.com/internetarchive/Zeno/internal/pkg/crawl/extractor" "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream" "github.com/internetarchive/Zeno/internal/pkg/queue" "github.com/internetarchive/Zeno/internal/pkg/utils" - "github.com/remeh/sizedwaitgroup" ) -var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`) -var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`) - -func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie, headers map[string]string) error { - var resp *http.Response - - // Prepare GET request - req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil) - if err != nil { - return err - } - - req.Header.Set("Referer", utils.URLToString(item.ParentURL)) - req.Header.Set("User-Agent", c.UserAgent) - - // If headers are passed, apply them to the request - if headers != nil { - for key, value := range headers { - req.Header.Set(key, value) - } - } - - // Apply cookies obtained from the original URL captured - for i := range 
cookies { - req.AddCookie(cookies[i]) - } - - resp, err = c.executeGET(item, req, false) - if err != nil && err.Error() == "URL from redirection has already been seen" { - return nil - } else if err != nil { - return err - } - defer resp.Body.Close() - - if extractor.IsM3U8(resp) { - assets, err := extractor.M3U8(resp) - if err == nil { - assets = c.seencheckAssets(assets, item) - if len(assets) != 0 { - c.captureAssets(item, assets, cookies, headers) - } - } else { - c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8") - } - } - - io.Copy(io.Discard, resp.Body) - - return nil -} - -func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie, headers map[string]string) { - // TODO: implement a counter for the number of assets - // currently being processed - // c.Frontier.QueueCount.Incr(int64(len(assets))) - swg := sizedwaitgroup.New(int(c.MaxConcurrentAssets)) - excluded := false - - for _, asset := range assets { - // TODO: implement a counter for the number of assets - // currently being processed - // c.Frontier.QueueCount.Incr(-1) - - // Just making sure we do not over archive by archiving the original URL - if utils.URLToString(item.URL) == utils.URLToString(asset) { - continue - } - - // If the URL match any excluded string, we ignore it - for _, excludedString := range c.ExcludedStrings { - if strings.Contains(utils.URLToString(asset), excludedString) { - excluded = true - break - } - } - - if excluded { - excluded = false - continue - } - - swg.Add() - c.URIsPerSecond.Incr(1) - - go func(asset *url.URL, swg *sizedwaitgroup.SizedWaitGroup) { - defer swg.Done() - - // Create the asset's item - newAsset, err := queue.NewItem(asset, item.URL, "asset", item.Hop, "", false) - if err != nil { - c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{ - "parentHop": item.Hop, - "parentUrl": utils.URLToString(item.URL), - "type": "asset", - })).Error("error while creating asset 
item") - return - } - - // Capture the asset - err = c.captureAsset(newAsset, cookies, headers) - if err != nil { - c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{ - "parentHop": item.Hop, - "parentUrl": utils.URLToString(item.URL), - "type": "asset", - })).Error("error while capturing asset") - return - } - - // If we made it to this point, it means that the asset have been crawled successfully, - // then we can increment the locallyCrawled variable - atomic.AddUint64(&item.LocallyCrawled, 1) - }(asset, &swg) - } - - swg.Wait() -} - -func (c *Crawl) seencheckAssets(assets []*url.URL, item *queue.Item) []*url.URL { - if c.UseSeencheck { - if c.UseHQ { - seencheckedURLs, err := c.HQSeencheckURLs(assets) - // We ignore the error here because we don't want to slow down the crawl - // if HQ is down or if the request failed. So if we get an error, we just - // continue with the original list of assets. - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ - "urls": assets, - "parentHop": item.Hop, - "parentUrl": utils.URLToString(item.URL), - })).Error("error while seenchecking assets via HQ") - } else { - assets = seencheckedURLs - } - - if len(assets) == 0 { - return []*url.URL{} - } - } else { - seencheckedBatch := []*url.URL{} - - for _, URL := range assets { - found := c.Seencheck.SeencheckURL(utils.URLToString(URL), "asset") - if found { - continue - } - - seencheckedBatch = append(seencheckedBatch, URL) - } - - if len(seencheckedBatch) == 0 { - return []*url.URL{} - } - - assets = seencheckedBatch - } - } - - return assets -} +// var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`) +// var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`) + +// func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie, headers map[string]string) error { +// var resp *http.Response + +// // Prepare GET request +// req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil) +// if err 
!= nil { +// return err +// } + +// req.Header.Set("Referer", utils.URLToString(item.ParentURL)) +// req.Header.Set("User-Agent", c.UserAgent) + +// // If headers are passed, apply them to the request +// if headers != nil { +// for key, value := range headers { +// req.Header.Set(key, value) +// } +// } + +// // Apply cookies obtained from the original URL captured +// for i := range cookies { +// req.AddCookie(cookies[i]) +// } + +// resp, err = c.executeGET(item, req, false) +// if err != nil && err.Error() == "URL from redirection has already been seen" { +// return nil +// } else if err != nil { +// return err +// } +// defer resp.Body.Close() + +// if extractor.IsM3U8(resp) { +// assets, err := extractor.M3U8(resp) +// if err == nil { +// assets = c.seencheckAssets(assets, item) +// if len(assets) != 0 { +// c.captureAssets(item, assets, cookies, headers) +// } +// } else { +// c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8") +// } +// } + +// io.Copy(io.Discard, resp.Body) + +// return nil +// } + +// func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie, headers map[string]string) { +// // TODO: implement a counter for the number of assets +// // currently being processed +// // c.Frontier.QueueCount.Incr(int64(len(assets))) +// swg := sizedwaitgroup.New(int(c.MaxConcurrentAssets)) +// excluded := false + +// for _, asset := range assets { +// // TODO: implement a counter for the number of assets +// // currently being processed +// // c.Frontier.QueueCount.Incr(-1) + +// // Just making sure we do not over archive by archiving the original URL +// if utils.URLToString(item.URL) == utils.URLToString(asset) { +// continue +// } + +// // If the URL match any excluded string, we ignore it +// for _, excludedString := range c.ExcludedStrings { +// if strings.Contains(utils.URLToString(asset), excludedString) { +// excluded = true +// break +// } +// } + +// if excluded { +// 
excluded = false +// continue +// } + +// swg.Add() +// c.URIsPerSecond.Incr(1) + +// go func(asset *url.URL, swg *sizedwaitgroup.SizedWaitGroup) { +// defer swg.Done() + +// // Create the asset's item +// newAsset, err := queue.NewItem(asset, item.URL, "asset", item.Hop, "", false) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{ +// "parentHop": item.Hop, +// "parentUrl": utils.URLToString(item.URL), +// "type": "asset", +// })).Error("error while creating asset item") +// return +// } + +// // Capture the asset +// err = c.captureAsset(newAsset, cookies, headers) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{ +// "parentHop": item.Hop, +// "parentUrl": utils.URLToString(item.URL), +// "type": "asset", +// })).Error("error while capturing asset") +// return +// } + +// // If we made it to this point, it means that the asset have been crawled successfully, +// // then we can increment the locallyCrawled variable +// atomic.AddUint64(&item.LocallyCrawled, 1) +// }(asset, &swg) +// } + +// swg.Wait() +// } + +// func (c *Crawl) seencheckAssets(assets []*url.URL, item *queue.Item) []*url.URL { +// if c.UseSeencheck { +// if c.UseHQ { +// seencheckedURLs, err := c.HQSeencheckURLs(assets) +// // We ignore the error here because we don't want to slow down the crawl +// // if HQ is down or if the request failed. So if we get an error, we just +// // continue with the original list of assets. 
+// if err != nil { +// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ +// "urls": assets, +// "parentHop": item.Hop, +// "parentUrl": utils.URLToString(item.URL), +// })).Error("error while seenchecking assets via HQ") +// } else { +// assets = seencheckedURLs +// } + +// if len(assets) == 0 { +// return []*url.URL{} +// } +// } else { +// seencheckedBatch := []*url.URL{} + +// for _, URL := range assets { +// found := c.Seencheck.SeencheckURL(utils.URLToString(URL), "asset") +// if found { +// continue +// } + +// seencheckedBatch = append(seencheckedBatch, URL) +// } + +// if len(seencheckedBatch) == 0 { +// return []*url.URL{} +// } + +// assets = seencheckedBatch +// } +// } + +// return assets +// } func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) { var rawAssets []string diff --git a/go.mod b/go.mod index 507df8be..8cb3c2b9 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,9 @@ require ( ) require ( + github.com/PuerkitoBio/goquery v1.10.0 // indirect github.com/andybalholm/brotli v1.1.0 // indirect + github.com/andybalholm/cascadia v1.3.2 // indirect github.com/cloudflare/circl v1.4.0 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/gdamore/encoding v1.0.0 // indirect @@ -28,6 +30,7 @@ require ( github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/ws v1.4.0 // indirect github.com/golang/snappy v0.0.1 // indirect + github.com/grafov/m3u8 v0.12.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/klauspost/compress v1.17.10 // indirect @@ -62,4 +65,5 @@ require ( golang.org/x/tools v0.25.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect + mvdan.cc/xurls/v2 v2.5.0 // indirect ) diff --git a/go.sum b/go.sum index 709aaa27..de28ef84 100644 --- a/go.sum +++ b/go.sum @@ -2,10 +2,14 @@ github.com/CorentinB/warc v0.8.53 
h1:xVz3RMdZ6faAqTtLfcK1/yl8ZTansy+B2en//EZLUlM github.com/CorentinB/warc v0.8.53/go.mod h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8= github.com/CorentinB/warc v0.8.54 h1:Y3iK5FEF+m9pKftWf6PXYRC0sYLbwph9j//d9DEwy9g= github.com/CorentinB/warc v0.8.54/go.mod h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8= +github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4= +github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4= github.com/ada-url/goada v0.0.0-20240402045241-5e45a5777313 h1:jdPBTZ3nZwBBZzz5SCFUMcTxoZr8t9ogwdvD3P27f/E= github.com/ada-url/goada v0.0.0-20240402045241-5e45a5777313/go.mod h1:+D/veNwI2mA1hDYLVrYSobYcLFWm6e3DJ/H/d/dxlu8= github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= +github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= +github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/cloudflare/circl v1.4.0 h1:BV7h5MgrktNzytKmWjpOtdYrf0lkkbF8YMlBGPhJQrY= @@ -44,6 +48,8 @@ github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grafov/m3u8 v0.12.0 h1:T6iTwTsSEtMcwkayef+FJO8kj+Sglr4Lh81Zj8Ked/4= +github.com/grafov/m3u8 v0.12.0/go.mod h1:nqzOkfBiZJENr52zTVd/Dcl03yzphIMbJqkXGu+u080= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= 
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= @@ -155,6 +161,7 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo= golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -171,12 +178,14 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= golang.org/x/term v0.17.0/go.mod 
h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= golang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU= golang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E= @@ -184,6 +193,7 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= @@ -208,3 +218,5 @@ gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +mvdan.cc/xurls/v2 v2.5.0 h1:lyBNOm8Wo71UknhUs4QTFUNNMyxy2JEIaKKo0RWOh+8= +mvdan.cc/xurls/v2 v2.5.0/go.mod h1:yQgaGQ1rFtJUzkmKiHYSSfuQxqfYmd//X6PxvholpeE= diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index 5f921709..e918fc1b 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -2,7 +2,6 @@ package archiver import ( "context" - "io" "net/http" "sync" @@ -143,14 +142,14 @@ func archive(item *models.Item) { ) // Determines the URLs that need to be captured, if the item's status is fresh we need - // to capture the seed, else if it's a redirection we need to captue it, and + // to capture the seed, else if it's a redirection we need to capture it, and // else we need to capture the child URLs (assets), 
in parallel if item.GetRedirection() != nil { URLsToCapture = append(URLsToCapture, item.GetRedirection()) - } else if item.GetStatus() == models.ItemPreProcessed { - URLsToCapture = append(URLsToCapture, item.GetURL()) - } else { + } else if len(item.GetChilds()) > 0 { URLsToCapture = item.GetChilds() + } else { + URLsToCapture = append(URLsToCapture, item.GetURL()) } for _, URL := range URLsToCapture { @@ -159,6 +158,7 @@ func archive(item *models.Item) { go func(URL *models.URL) { defer wg.Done() defer func() { <-guard }() + defer stats.URLsCrawledIncr() var ( err error @@ -181,16 +181,8 @@ func archive(item *models.Item) { return } - stats.URLsCrawledIncr() - // Set the response in the item URL.SetResponse(resp) - - // For now, we only consume it - _, err = io.Copy(io.Discard, resp.Body) - if err != nil { - logger.Error("unable to consume response body", "url", URL.String(), "err", err.Error(), "func", "archiver.archive") - } }(URL) } diff --git a/internal/pkg/postprocessor/assets.go b/internal/pkg/postprocessor/assets.go new file mode 100644 index 00000000..8b73cd3e --- /dev/null +++ b/internal/pkg/postprocessor/assets.go @@ -0,0 +1,294 @@ +package postprocessor + +import ( + "regexp" + "strconv" + "strings" + + "github.com/PuerkitoBio/goquery" + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/postprocessor/extractor" + "github.com/internetarchive/Zeno/internal/pkg/utils" + "github.com/internetarchive/Zeno/pkg/models" + "mvdan.cc/xurls/v2" +) + +var ( + backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`) + urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`) + linkRegex = xurls.Relaxed() +) + +func extractAssets(seed *models.Item) (err error) { + var rawAssets []string + + // Build goquery doc from response + doc, err := goquery.NewDocumentFromReader(seed.GetURL().GetResponse().Body) + if err != nil { + return err + } + + // Get assets from JSON payloads in data-item values + 
doc.Find("[data-item]").Each(func(index int, item *goquery.Selection) { + dataItem, exists := item.Attr("data-item") + if exists { + URLsFromJSON, err := extractor.GetURLsFromJSON([]byte(dataItem)) + if err != nil { + logger.Debug("unable to extract URLs from JSON in data-item attribute", "err", err, "url", seed.GetURL(), "item", seed.GetShortID()) + } else { + rawAssets = append(rawAssets, URLsFromJSON...) + } + } + }) + + // Check all elements style attributes for background-image & also data-preview + doc.Find("*").Each(func(index int, item *goquery.Selection) { + style, exists := item.Attr("style") + if exists { + matches := backgroundImageRegex.FindAllStringSubmatch(style, -1) + + for match := range matches { + if len(matches[match]) > 0 { + matchFound := matches[match][1] + + // Don't extract CSS elements that aren't URLs + if strings.Contains(matchFound, "%") || strings.HasPrefix(matchFound, "0.") || strings.HasPrefix(matchFound, "--font") || strings.HasPrefix(matchFound, "--size") || strings.HasPrefix(matchFound, "--color") || strings.HasPrefix(matchFound, "--shreddit") || strings.HasPrefix(matchFound, "100vh") { + continue + } + + rawAssets = append(rawAssets, matchFound) + } + } + } + + dataPreview, exists := item.Attr("data-preview") + if exists { + if strings.HasPrefix(dataPreview, "http") { + rawAssets = append(rawAssets, dataPreview) + } + } + }) + + // Extract assets on the page (images, scripts, videos..) 
+ if !utils.StringInSlice("img", config.Get().DisableHTMLTag) { + doc.Find("img").Each(func(index int, item *goquery.Selection) { + link, exists := item.Attr("src") + if exists { + rawAssets = append(rawAssets, link) + } + + link, exists = item.Attr("data-src") + if exists { + rawAssets = append(rawAssets, link) + } + + link, exists = item.Attr("data-lazy-src") + if exists { + rawAssets = append(rawAssets, link) + } + + link, exists = item.Attr("data-srcset") + if exists { + links := strings.Split(link, ",") + for _, link := range links { + rawAssets = append(rawAssets, strings.Split(strings.TrimSpace(link), " ")[0]) + } + } + + link, exists = item.Attr("srcset") + if exists { + links := strings.Split(link, ",") + for _, link := range links { + rawAssets = append(rawAssets, strings.Split(strings.TrimSpace(link), " ")[0]) + } + } + }) + } + + if !utils.StringInSlice("video", config.Get().DisableHTMLTag) { + doc.Find("video").Each(func(index int, item *goquery.Selection) { + link, exists := item.Attr("src") + if exists { + rawAssets = append(rawAssets, link) + } + }) + } + + if !utils.StringInSlice("style", config.Get().DisableHTMLTag) { + doc.Find("style").Each(func(index int, item *goquery.Selection) { + matches := urlRegex.FindAllStringSubmatch(item.Text(), -1) + for match := range matches { + matchReplacement := matches[match][1] + matchReplacement = strings.Replace(matchReplacement, "'", "", -1) + matchReplacement = strings.Replace(matchReplacement, "\"", "", -1) + + // If the URL already has http (or https), we don't need add anything to it. 
+ if !strings.Contains(matchReplacement, "http") { + matchReplacement = strings.Replace(matchReplacement, "//", "http://", -1) + } + + if strings.HasPrefix(matchReplacement, "#wp-") { + continue + } + + rawAssets = append(rawAssets, matchReplacement) + } + }) + } + + if !utils.StringInSlice("script", config.Get().DisableHTMLTag) { + doc.Find("script").Each(func(index int, item *goquery.Selection) { + link, exists := item.Attr("src") + if exists { + rawAssets = append(rawAssets, link) + } + + scriptType, exists := item.Attr("type") + if exists { + if scriptType == "application/json" { + URLsFromJSON, err := extractor.GetURLsFromJSON([]byte(item.Text())) + if err != nil { + // TODO: maybe add back when https://github.com/internetarchive/Zeno/issues/147 is fixed + // c.Log.Debug("unable to extract URLs from JSON in script tag", "error", err, "url", URL) + } else { + rawAssets = append(rawAssets, URLsFromJSON...) + } + } + } + + // Apply regex on the script's HTML to extract potential assets + outerHTML, err := goquery.OuterHtml(item) + if err != nil { + logger.Debug("unable to extract outer HTML from script tag", "err", err, "url", seed.GetURL(), "item", seed.GetShortID()) + } else { + scriptLinks := utils.DedupeStrings(linkRegex.FindAllString(outerHTML, -1)) + for _, scriptLink := range scriptLinks { + if strings.HasPrefix(scriptLink, "http") { + // Escape URLs when unicode runes are present in the extracted URLs + scriptLink, err := strconv.Unquote(`"` + scriptLink + `"`) + if err != nil { + logger.Debug("unable to escape URL from JSON in script tag", "error", err, "url", scriptLink, "item", seed.GetShortID()) + continue + } + rawAssets = append(rawAssets, scriptLink) + } + } + } + + // Some $10,000 Every Day You Survive In A Grocery Store - YouTube
PrésentationPresseDroits d'auteurNous contacterCréateursPublicitéDéveloppeursRésilier vos abonnementsConditions d'utilisationConfidentialitéRègles et sécuritéPremiers pas sur YouTubeTester de nouvelles fonctionnalités
\ No newline at end of file diff --git a/.old/internal/pkg/crawl/link_header.go b/internal/pkg/postprocessor/extractor/link_header.go similarity index 60% rename from .old/internal/pkg/crawl/link_header.go rename to internal/pkg/postprocessor/extractor/link_header.go index abd07111..88fa4191 100644 --- a/.old/internal/pkg/crawl/link_header.go +++ b/internal/pkg/postprocessor/extractor/link_header.go @@ -1,55 +1,56 @@ -package crawl +package extractor import ( "strings" -) -// Represents a Link struct, containing a URL to which it links, and a Rel to define the relation -type Link struct { - URL string - Rel string -} + "github.com/internetarchive/Zeno/pkg/models" +) -// Parse parses a raw Link header in the form: +// ExtractURLsFromHeader parses a raw Link header in the form: // // ; rel="what", ; rel="any"; another="yes", ; rel="thing" // -// returning a slice of Link structs +// returning a slice of models.URL structs // Each of these are separated by a `, ` and the in turn by a `; `, with the first always being the url, and the remaining the key-val pairs // See: https://simon-frey.com/blog/link-header/, https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Link -func Parse(link string) []Link { - var links []Link +func ExtractURLsFromHeader(link string) (URLs []*models.URL) { + if link == "" { + return URLs + } for _, link := range strings.Split(link, ", ") { parts := strings.Split(link, ";") if len(parts) < 1 { - // Malformed input, somehow we didn't get atleast one part + // Malformed input, somehow we didn't get at least one part continue } - url := strings.TrimSpace(strings.Trim(parts[0], "<>")) - rel := "" + URL := strings.TrimSpace(strings.Trim(parts[0], "<>")) + if URL == "" { + // Malformed input, URL is empty + continue + } for _, attrs := range parts[1:] { - key, value := ParseAttr(attrs) + key, _ := parseAttr(attrs) if key == "" { // Malformed input, somehow the key is nothing continue } if key == "rel" { - rel = value break } } - links = 
append(links, Link{URL: url, Rel: rel}) + + URLs = append(URLs, &models.URL{Raw: URL}) } - return links + return URLs } // Parse a single attribute key value pair and return it -func ParseAttr(attrs string) (key, value string) { +func parseAttr(attrs string) (key, value string) { kv := strings.SplitN(attrs, "=", 2) if len(kv) != 2 { diff --git a/internal/pkg/postprocessor/extractor/link_header_test.go b/internal/pkg/postprocessor/extractor/link_header_test.go new file mode 100644 index 00000000..113bac38 --- /dev/null +++ b/internal/pkg/postprocessor/extractor/link_header_test.go @@ -0,0 +1,133 @@ +package extractor + +import ( + "fmt" + "reflect" + "testing" + + "github.com/internetarchive/Zeno/pkg/models" +) + +func TestExtractURLsFromHeader(t *testing.T) { + tests := []struct { + name string + link string + expected []*models.URL + }{ + { + name: "Valid Link header with multiple URLs", + link: `; rel="preconnect", ; rel="preload"`, + expected: []*models.URL{ + {Raw: "https://one.example.com"}, + {Raw: "https://two.example.com"}, + }, + }, + { + name: "Valid Link header with no URLs", + link: ``, + expected: nil, + }, + { + name: "Malformed Link header", + link: `https://one.example.com>;; rel=preconnect";`, + expected: []*models.URL{ + {Raw: "https://one.example.com"}, + }, + }, + { + name: "Link header with nested elements containing URLs", + link: `; rel="preconnect"`, + expected: []*models.URL{ + {Raw: "https://example.com/nested"}, + }, + }, + { + name: "Link header with attributes containing URLs", + link: `; rel="preconnect"`, + expected: []*models.URL{ + {Raw: "https://example.com/attr"}, + }, + }, + { + name: "Link header with mixed content", + link: `; rel="preconnect"`, + expected: []*models.URL{ + {Raw: "https://example.com/mixed"}, + }, + }, + { + name: "Large Link header content", + link: func() string { + var link string + for i := 0; i < 1000; i++ { + link += fmt.Sprintf("; rel=\"preconnect\", ", i) + } + return link[:len(link)-2] + }(), + 
expected: func() []*models.URL { + var urls []*models.URL + for i := 0; i < 1000; i++ { + urls = append(urls, &models.URL{Raw: fmt.Sprintf("https://example.com/page%d", i)}) + } + return urls + }(), + }, + { + name: "Link header with special characters in URLs", + link: `; rel="preconnect"`, + expected: []*models.URL{ + {Raw: "https://example.com/page?param=1&other=2"}, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := ExtractURLsFromHeader(tt.link) + if !reflect.DeepEqual(got, tt.expected) { + t.Fatalf("ExtractURLsFromHeader() = %v, want %v", got, tt.expected) + } + }) + } +} + +func TestParseAttr(t *testing.T) { + tests := []struct { + attr string + wantKey string + wantValue string + }{ + { + attr: `rel="preconnect"`, + wantKey: "rel", + wantValue: "preconnect", + }, + { + attr: `="preconnect"`, + wantKey: "", + wantValue: "preconnect", + }, + { + attr: `foo="bar"`, + wantKey: "foo", + wantValue: "bar", + }, + { + attr: `key="value"`, + wantKey: "key", + wantValue: "value", + }, + } + + for _, tt := range tests { + t.Run(tt.attr, func(t *testing.T) { + gotKey, gotValue := parseAttr(tt.attr) + if gotKey != tt.wantKey { + t.Fatalf("parseAttr() gotKey = %v, want %v", gotKey, tt.wantKey) + } + if gotValue != tt.wantValue { + t.Fatalf("parseAttr() gotValue = %v, want %v", gotValue, tt.wantValue) + } + }) + } +} diff --git a/internal/pkg/postprocessor/outlinks.go b/internal/pkg/postprocessor/outlinks.go index 7aefec5d..e5481e5a 100644 --- a/internal/pkg/postprocessor/outlinks.go +++ b/internal/pkg/postprocessor/outlinks.go @@ -36,6 +36,12 @@ func extractOutlinks(URL *models.URL, item *models.Item) (outlinks []*models.URL logger.Debug("no extractor used for page", "content-type", contentType, "item", item.GetShortID()) } + // Try to extract links from link headers + linksFromLinkHeader := extractor.ExtractURLsFromHeader(URL.GetResponse().Header.Get("link")) + if linksFromLinkHeader != nil { + outlinks = append(outlinks, 
linksFromLinkHeader...) + } + // If the page is a text/* content type, extract links from the body (aggressively) if strings.Contains(contentType, "text/") { outlinks = append(outlinks, extractLinksFromPage(URL)...) From abb57b6abc0443d396682c6750799102cc1861e7 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Fri, 10 Jan 2025 15:12:06 +0100 Subject: [PATCH 195/295] add: ina.fr site-specific code --- .old/internal/pkg/crawl/capture.go | 20 -- internal/pkg/postprocessor/assets.go | 7 + .../pkg/postprocessor/sitespecific/ina/ina.go | 202 ++++++++++++++++++ 3 files changed, 209 insertions(+), 20 deletions(-) create mode 100644 internal/pkg/postprocessor/sitespecific/ina/ina.go diff --git a/.old/internal/pkg/crawl/capture.go b/.old/internal/pkg/crawl/capture.go index 1d0dcaee..6171cb1d 100644 --- a/.old/internal/pkg/crawl/capture.go +++ b/.old/internal/pkg/crawl/capture.go @@ -414,26 +414,6 @@ func (c *Crawl) Capture(item *queue.Item) error { } return nil - } else if ina.IsAPIURL(req) { - rawAssets, err := ina.ExtractMedias(resp) - if err != nil { - c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract medias from INA") - } - - if len(rawAssets) != 0 { - assets = c.seencheckAssets(rawAssets, item) - - if len(assets) != 0 { - for _, asset := range rawAssets { - playerItem, err := queue.NewItem(asset, item.URL, "seed", 0, "", false) - if err != nil { - c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to create new item from asset") - } else { - c.Capture(playerItem) - } - } - } - } } // Scrape potential URLs from Link HTTP header diff --git a/internal/pkg/postprocessor/assets.go b/internal/pkg/postprocessor/assets.go index ae5a2945..1cd531e5 100644 --- a/internal/pkg/postprocessor/assets.go +++ b/internal/pkg/postprocessor/assets.go @@ -4,6 +4,7 @@ import ( "github.com/PuerkitoBio/goquery" "github.com/internetarchive/Zeno/internal/pkg/log" "github.com/internetarchive/Zeno/internal/pkg/postprocessor/extractor" + 
"github.com/internetarchive/Zeno/internal/pkg/postprocessor/sitespecific/ina" "github.com/internetarchive/Zeno/pkg/models" ) @@ -19,6 +20,12 @@ func extractAssets(doc *goquery.Document, URL *models.URL, item *models.Item) (a switch { // Order is important, we want to check for more specific things first, // as they may trigger more general extractors (e.g. HTML) + case ina.IsAPIURL(URL): + assets, err := ina.ExtractMedias(URL) + if err != nil { + logger.Error("unable to extract medias from INA", "err", err.Error(), "item", item.GetShortID()) + return assets, err + } case extractor.IsM3U8(URL): assets, err = extractor.M3U8(URL) if err != nil { diff --git a/internal/pkg/postprocessor/sitespecific/ina/ina.go b/internal/pkg/postprocessor/sitespecific/ina/ina.go new file mode 100644 index 00000000..c46acf0c --- /dev/null +++ b/internal/pkg/postprocessor/sitespecific/ina/ina.go @@ -0,0 +1,202 @@ +package ina + +import ( + "encoding/json" + "io" + "net/http" + "net/url" + "regexp" + "strings" + "sync" + "time" + + "github.com/CorentinB/warc" + "github.com/PuerkitoBio/goquery" + "github.com/internetarchive/Zeno/internal/pkg/utils" + "github.com/internetarchive/Zeno/pkg/models" +) + +var ( + playerVersion string + playerVersionLock sync.Mutex + playerRegex *regexp.Regexp +) + +func init() { + playerRegex = regexp.MustCompile(`"//ssl\.p\.jwpcdn\.com[^"]+\.js"`) +} + +type APIResponse struct { + ID string `json:"id"` + Title string `json:"title"` + Description string `json:"description"` + DateOfBroadcast time.Time `json:"dateOfBroadcast"` + Type string `json:"type"` + Duration int `json:"duration"` + Categories []any `json:"categories"` + Credits []struct { + Context struct { + Vocab string `json:"@vocab"` + Hydra string `json:"hydra"` + Name string `json:"name"` + Value string `json:"value"` + Attributes string `json:"attributes"` + } `json:"@context"` + Type string `json:"@type"` + ID string `json:"@id"` + Name string `json:"name"` + Value string `json:"value"` + 
Attributes []struct { + Context struct { + Vocab string `json:"@vocab"` + Hydra string `json:"hydra"` + Key string `json:"key"` + Value string `json:"value"` + } `json:"@context"` + Type string `json:"@type"` + ID string `json:"@id"` + Key string `json:"key"` + Value string `json:"value"` + } `json:"attributes"` + } `json:"credits"` + Restrictions []any `json:"restrictions"` + ResourceURL string `json:"resourceUrl"` + ResourceThumbnail string `json:"resourceThumbnail"` + RestrictedBroadcastCountries []any `json:"restrictedBroadcastCountries"` + EmbedURL string `json:"embedUrl"` + AllowEmbed bool `json:"allowEmbed"` + Ratio string `json:"ratio"` + CollectionTitle string `json:"collectionTitle"` + IsOnline bool `json:"isOnline"` + AllowAds bool `json:"allowAds"` + TypeMedia string `json:"typeMedia"` + HideLogo bool `json:"hideLogo"` + URI string `json:"uri"` + AdvertisingAsset bool `json:"advertisingAsset"` +} + +func IsURL(URL *models.URL) bool { + return strings.Contains(URL.String(), "ina.fr") +} + +func IsAPIURL(URL *models.URL) bool { + return strings.Contains(URL.String(), "apipartner.ina.fr") && !strings.Contains(URL.String(), "playerConfigurations.json") +} + +func ExtractPlayerURLs(doc *goquery.Document, c *warc.CustomHTTPClient) []*url.URL { + var assets []string + + doc.Find("div[data-type=player]").Each(func(i int, s *goquery.Selection) { + if playerConfigURL, exists := s.Attr("config-url"); exists { + assets = append(assets, playerConfigURL) + } + + if assetDetailsURL, exists := s.Attr("asset-details-url"); exists { + assets = append(assets, assetDetailsURL) + } + + if posterURL, exists := s.Attr("poster"); exists { + assets = append(assets, posterURL) + } + }) + + assets = append(assets, getJWPlayerURLs(c)...) 
+ + return utils.StringSliceToURLSlice(assets) +} + +func getJWPlayerURLs(c *warc.CustomHTTPClient) (URLs []string) { + playerVersionLock.Lock() + defer playerVersionLock.Unlock() + + if playerVersion == "" { + resp, err := c.Get("https://player-hub.ina.fr/version") + if err != nil { + return URLs + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return URLs + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return URLs + } + + playerVersion = string(body) + + URLs = append(URLs, + "https://player-hub.ina.fr/dist/ina-player.min.js?version="+playerVersion, + "https://player-hub.ina.fr/dist/player-default-skin.min.css?version="+playerVersion, + "https://player-hub.ina.fr/assets/player/svg/pause.svg", + "https://player-hub.ina.fr/assets/player/svg/play.svg", + "https://player-hub.ina.fr/assets/player/svg/backward.svg", + "https://player-hub.ina.fr/assets/player/svg/forward.svg", + ) + + // Get the JWPlayer JS code + playerResp, err := c.Get("https://player-hub.ina.fr/js/jwplayer/jwplayer.js?version=" + playerVersion) + if err != nil { + return URLs + } + defer playerResp.Body.Close() + + if playerResp.StatusCode != http.StatusOK { + return URLs + } + + // Find the JWPlayer assets in the JS file + body, err = io.ReadAll(playerResp.Body) + if err != nil { + return URLs + } + + matches := playerRegex.FindAllString(string(body), -1) + + // Clean up the matches (remove quotes) + for _, match := range matches { + URLs = append(URLs, "https:"+match[1:len(match)-1]) + } + + URLs = append(URLs, "https://ssl.p.jwpcdn.com/player/v/"+extractJWPlayerVersion(string(body))+"/jwplayer.core.controls.html5.js") + } + + return URLs +} + +func extractJWPlayerVersion(body string) string { + lines := strings.Split(body, "\n") + for _, line := range lines { + if strings.Contains(line, "JW Player version") { + return strings.Split(line, "JW Player version ")[1] + } + } + return "" +} + +func ExtractMedias(URL *models.URL) (assets []*models.URL, err error) 
{ + defer URL.RewindBody() + + body, err := io.ReadAll(URL.GetBody()) + if err != nil { + return nil, err + } + + var data APIResponse + err = json.Unmarshal(body, &data) + if err != nil { + return nil, err + } + + for _, rawAsset := range []string{ + data.ResourceURL, + data.ResourceThumbnail, + "https://player.ina.fr" + data.EmbedURL, data.URI, + } { + assets = append(assets, &models.URL{Raw: rawAsset}) + } + + return assets, nil +} From da620815ed379fbcd7ffc8438a4b01b72399d119 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Fri, 10 Jan 2025 15:25:15 +0100 Subject: [PATCH 196/295] add: facebook.com post.php URL generation --- internal/pkg/postprocessor/postprocessor.go | 16 +++++++++++++++ .../sitespecific/facebook/facebook.go | 20 +++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 internal/pkg/postprocessor/sitespecific/facebook/facebook.go diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index 6043e189..84c67f4c 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -9,6 +9,7 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/config" "github.com/internetarchive/Zeno/internal/pkg/controler/pause" "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/Zeno/internal/pkg/postprocessor/sitespecific/facebook" "github.com/internetarchive/Zeno/internal/pkg/stats" "github.com/internetarchive/Zeno/pkg/models" ) @@ -178,6 +179,21 @@ func postprocess(item *models.Item) (outlinks []*models.Item) { continue } + // Execute site-specific post-processing + switch { + case facebook.IsFacebookPostURL(items[i].GetURL()): + err := items[i].AddChild( + models.NewItem( + uuid.New().String(), + facebook.GenerateEmbedURL(items[i].GetURL()), + items[i].GetURL().String(), + true, + ), models.ItemGotChildren) + if err != nil { + panic(err) + } + } + // Return if: // - the item is a child and the URL has more than one hop 
// - assets capture is disabled and domains crawl is disabled diff --git a/internal/pkg/postprocessor/sitespecific/facebook/facebook.go b/internal/pkg/postprocessor/sitespecific/facebook/facebook.go new file mode 100644 index 00000000..a55c81de --- /dev/null +++ b/internal/pkg/postprocessor/sitespecific/facebook/facebook.go @@ -0,0 +1,20 @@ +package facebook + +import ( + "fmt" + "net/url" + "strings" + + "github.com/internetarchive/Zeno/pkg/models" +) + +func IsFacebookPostURL(URL *models.URL) bool { + return strings.Contains(URL.String(), "facebook.com") && strings.Contains(URL.String(), "/posts/") +} + +func GenerateEmbedURL(URL *models.URL) *models.URL { + return &models.URL{ + Raw: fmt.Sprintf("https://www.facebook.com/plugins/post.php?href=%s&show_text=true", url.QueryEscape(URL.String())), + Hops: URL.GetHops(), + } +} From 8e142c12fb0c9efdd6988313dcb411793a10429b Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Fri, 10 Jan 2025 15:36:24 +0100 Subject: [PATCH 197/295] add: tiktok.com preprocessing + config UA header --- internal/pkg/preprocessor/preprocessor.go | 11 +++++++- .../sitespecific/tiktok/tiktok.go | 28 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 internal/pkg/preprocessor/sitespecific/tiktok/tiktok.go diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index 54d4048c..5daf91e6 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -10,6 +10,7 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/controler/pause" "github.com/internetarchive/Zeno/internal/pkg/log" "github.com/internetarchive/Zeno/internal/pkg/preprocessor/seencheck" + "github.com/internetarchive/Zeno/internal/pkg/preprocessor/sitespecific/tiktok" "github.com/internetarchive/Zeno/internal/pkg/source/hq" "github.com/internetarchive/Zeno/internal/pkg/stats" "github.com/internetarchive/Zeno/pkg/models" @@ -159,6 +160,7 @@ func 
preprocess(item *models.Item) { continue } } + // TODO : normalize seeds // // else { @@ -236,7 +238,6 @@ func preprocess(item *models.Item) { // Finally, we build the requests, applying any site-specific behavior needed for i := range items { - // TODO: apply site-specific stuff req, err := http.NewRequest(http.MethodGet, items[i].GetURL().String(), nil) if err != nil { logger.Error("unable to create request for URL", "url", items[i].GetURL().String(), "err", err.Error()) @@ -244,6 +245,14 @@ func preprocess(item *models.Item) { continue } + // Apply configured User-Agent + req.Header.Set("User-Agent", config.Get().UserAgent) + + switch { + case tiktok.IsTikTokURL(items[i].GetURL()): + tiktok.AddHeaders(req) + } + items[i].GetURL().SetRequest(req) items[i].SetStatus(models.ItemPreProcessed) } diff --git a/internal/pkg/preprocessor/sitespecific/tiktok/tiktok.go b/internal/pkg/preprocessor/sitespecific/tiktok/tiktok.go new file mode 100644 index 00000000..9efc1316 --- /dev/null +++ b/internal/pkg/preprocessor/sitespecific/tiktok/tiktok.go @@ -0,0 +1,28 @@ +package tiktok + +import ( + "net/http" + "strings" + + "github.com/internetarchive/Zeno/pkg/models" +) + +func IsTikTokURL(URL *models.URL) bool { + return strings.Contains(URL.String(), "tiktok.com/") +} + +func AddHeaders(req *http.Request) { + req.Header.Set("Authority", "www.tiktok.com") + req.Header.Set("Sec-Ch-Ua", "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"99\", \"Microsoft Edge\";v=\"99\"") + req.Header.Set("Sec-Ch-Ua-Mobile", "?0") + req.Header.Set("Sec-Ch-Ua-Platform", "\"Linux\"") + req.Header.Set("Dnt", "1") + req.Header.Set("Upgrade-Insecure-Requests", "1") + req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.52") + req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") + req.Header.Set("Sec-Fetch-Site", 
"none") + req.Header.Set("Sec-Fetch-Mode", "navigate") + req.Header.Set("Sec-Fetch-User", "?1") + req.Header.Set("Sec-Fetch-Dest", "document") + req.Header.Set("Accept-Language", "en-US,en;q=0.9,fr;q=0.8") +} From 5e8a1f8e7b2f30668c4c57ad0dc0710c721a7446 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Fri, 10 Jan 2025 22:45:07 +0100 Subject: [PATCH 198/295] fix: tags being treated as assets instead of outlinks --- internal/pkg/postprocessor/assets.go | 2 +- internal/pkg/postprocessor/extractor/html.go | 91 ++++++++++++-------- internal/pkg/postprocessor/outlinks.go | 9 +- internal/pkg/postprocessor/postprocessor.go | 2 +- 4 files changed, 65 insertions(+), 39 deletions(-) diff --git a/internal/pkg/postprocessor/assets.go b/internal/pkg/postprocessor/assets.go index 1cd531e5..e2e0589d 100644 --- a/internal/pkg/postprocessor/assets.go +++ b/internal/pkg/postprocessor/assets.go @@ -45,7 +45,7 @@ func extractAssets(doc *goquery.Document, URL *models.URL, item *models.Item) (a return assets, err } case extractor.IsHTML(URL): - assets, err = extractor.HTML(doc, URL, item) + assets, err = extractor.HTMLAssets(doc, URL, item) if err != nil { logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID()) return assets, err diff --git a/internal/pkg/postprocessor/extractor/html.go b/internal/pkg/postprocessor/extractor/html.go index 25969cc8..5d751df0 100644 --- a/internal/pkg/postprocessor/extractor/html.go +++ b/internal/pkg/postprocessor/extractor/html.go @@ -21,11 +21,64 @@ func IsHTML(URL *models.URL) bool { return isContentType(URL.GetResponse().Header.Get("Content-Type"), "html") } -func HTML(doc *goquery.Document, URL *models.URL, seed *models.Item) (assets []*models.URL, err error) { +func HTMLOutlinks(doc *goquery.Document, URL *models.URL) (outlinks []*models.URL, err error) { + defer URL.RewindBody() + + // logger := log.NewFieldedLogger(&log.Fields{ + // "component": "postprocessor.extractor.HTMLOutlinks", + // }) + + var 
rawOutlinks []string + + // Match tags with href, data-href, data-src, data-srcset, data-lazy-src, data-srcset, src, srcset + if !utils.StringInSlice("a", config.Get().DisableHTMLTag) { + var validAssetPath = []string{ + "static/", + "assets/", + "asset/", + "images/", + "image/", + "img/", + } + + var validAssetAttributes = []string{ + "href", + "data-href", + "data-src", + "data-srcset", + "data-lazy-src", + "data-srcset", + "src", + "srcset", + } + + doc.Find("a").Each(func(index int, item *goquery.Selection) { + for _, attr := range validAssetAttributes { + link, exists := item.Attr(attr) + if exists { + if utils.StringContainsSliceElements(link, validAssetPath) { + rawOutlinks = append(rawOutlinks, link) + } + } + } + }) + } + + for _, rawOutlink := range rawOutlinks { + outlinks = append(outlinks, &models.URL{ + Raw: rawOutlink, + Hops: URL.GetHops() + 1, + }) + } + + return outlinks, nil +} + +func HTMLAssets(doc *goquery.Document, URL *models.URL, seed *models.Item) (assets []*models.URL, err error) { defer URL.RewindBody() logger := log.NewFieldedLogger(&log.Fields{ - "component": "postprocessor.extractor.HTML", + "component": "postprocessor.extractor.HTMLAssets", }) var rawAssets []string @@ -71,40 +124,6 @@ func HTML(doc *goquery.Document, URL *models.URL, seed *models.Item) (assets []* } }) - // Match tags with href, data-href, data-src, data-srcset, data-lazy-src, data-srcset, src, srcset - if !utils.StringInSlice("a", config.Get().DisableHTMLTag) { - var validAssetPath = []string{ - "static/", - "assets/", - "asset/", - "images/", - "image/", - "img/", - } - - var validAssetAttributes = []string{ - "href", - "data-href", - "data-src", - "data-srcset", - "data-lazy-src", - "data-srcset", - "src", - "srcset", - } - - doc.Find("a").Each(func(index int, item *goquery.Selection) { - for _, attr := range validAssetAttributes { - link, exists := item.Attr(attr) - if exists { - if utils.StringContainsSliceElements(link, validAssetPath) { - rawAssets = 
append(rawAssets, link) - } - } - } - }) - } - // Extract assets on the page (images, scripts, videos..) if !utils.StringInSlice("img", config.Get().DisableHTMLTag) { doc.Find("img").Each(func(index int, item *goquery.Selection) { diff --git a/internal/pkg/postprocessor/outlinks.go b/internal/pkg/postprocessor/outlinks.go index e5481e5a..c30cbb0f 100644 --- a/internal/pkg/postprocessor/outlinks.go +++ b/internal/pkg/postprocessor/outlinks.go @@ -4,13 +4,14 @@ import ( "io" "strings" + "github.com/PuerkitoBio/goquery" "github.com/internetarchive/Zeno/internal/pkg/log" "github.com/internetarchive/Zeno/internal/pkg/postprocessor/extractor" "github.com/internetarchive/Zeno/internal/pkg/utils" "github.com/internetarchive/Zeno/pkg/models" ) -func extractOutlinks(URL *models.URL, item *models.Item) (outlinks []*models.URL, err error) { +func extractOutlinks(doc *goquery.Document, URL *models.URL, item *models.Item) (outlinks []*models.URL, err error) { var ( contentType = URL.GetResponse().Header.Get("Content-Type") logger = log.NewFieldedLogger(&log.Fields{ @@ -32,6 +33,12 @@ func extractOutlinks(URL *models.URL, item *models.Item) (outlinks []*models.URL logger.Error("unable to extract outlinks", "err", err.Error(), "item", item.GetShortID()) return outlinks, err } + case extractor.IsHTML(URL): + outlinks, err := extractor.HTMLOutlinks(doc, URL) + if err != nil { + logger.Error("unable to extract outlinks", "err", err.Error(), "item", item.GetShortID()) + return outlinks, err + } default: logger.Debug("no extractor used for page", "content-type", contentType, "item", item.GetShortID()) } diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index 84c67f4c..791f2c3d 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -239,7 +239,7 @@ func postprocess(item *models.Item) (outlinks []*models.Item) { // Extract outlinks from the page if config.Get().DomainsCrawl || 
((items[i].IsSeed() || items[i].IsRedirection()) && items[i].GetURL().GetHops() < config.Get().MaxHops) { logger.Info("extracting outlinks", "item", items[i].GetShortID()) - links, err := extractOutlinks(items[i].GetURL(), items[i]) + links, err := extractOutlinks(doc, items[i].GetURL(), items[i]) if err != nil { logger.Error("unable to extract outlinks", "err", err.Error(), "item", items[i].GetShortID()) continue From 883d04cc5c6a7f0fce614f093909fdc7178b7b42 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Fri, 10 Jan 2025 22:53:48 +0100 Subject: [PATCH 199/295] add: URL exclusion --- internal/pkg/preprocessor/preprocessor.go | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index 5daf91e6..9c13c909 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -13,6 +13,7 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/preprocessor/sitespecific/tiktok" "github.com/internetarchive/Zeno/internal/pkg/source/hq" "github.com/internetarchive/Zeno/internal/pkg/stats" + "github.com/internetarchive/Zeno/internal/pkg/utils" "github.com/internetarchive/Zeno/pkg/models" ) @@ -159,9 +160,22 @@ func preprocess(item *models.Item) { items[i].GetParent().RemoveChild(items[i]) continue } + + // Verify if the URL isn't to be excluded + if utils.StringContainsSliceElements(items[i].GetURL().GetParsed().Host, config.Get().ExcludeHosts) { + logger.Warn("URL excluded", "url", items[i].GetURL().String()) + items[i].GetParent().RemoveChild(items[i]) + continue + } + + if utils.StringContainsSliceElements(items[i].GetURL().GetParsed().Path, config.Get().ExcludeString) { + logger.Warn("URL excluded", "url", items[i].GetURL().String()) + items[i].GetParent().RemoveChild(items[i]) + continue + } } - // TODO : normalize seeds + // TODO : normalize seeds and apply exclusions to seeds // // else { // err := 
normalizeURL(items[i].GetURL(), &models.URL{Raw: items[i].GetSeedVia()}) From 3e9bad1b537d65e54d192750ef5f5e6da86c7570 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 12 Jan 2025 12:39:50 +0100 Subject: [PATCH 200/295] fix: ensure outlinks & assets hops are set to "parent hop" + 1 --- internal/pkg/postprocessor/assets.go | 2 +- internal/pkg/postprocessor/extractor/html.go | 2 +- internal/pkg/postprocessor/extractor/json.go | 2 +- .../postprocessor/extractor/link_header.go | 13 +++++++--- .../extractor/link_header_test.go | 25 ++++++++++++++++++- internal/pkg/postprocessor/extractor/m3u8.go | 2 +- internal/pkg/postprocessor/extractor/xml.go | 9 ++----- .../pkg/postprocessor/extractor/xml_test.go | 2 +- internal/pkg/postprocessor/outlinks.go | 4 +-- 9 files changed, 42 insertions(+), 19 deletions(-) diff --git a/internal/pkg/postprocessor/assets.go b/internal/pkg/postprocessor/assets.go index e2e0589d..2199a843 100644 --- a/internal/pkg/postprocessor/assets.go +++ b/internal/pkg/postprocessor/assets.go @@ -39,7 +39,7 @@ func extractAssets(doc *goquery.Document, URL *models.URL, item *models.Item) (a return assets, err } case extractor.IsXML(URL): - assets, err = extractor.XML(URL, false) + assets, err = extractor.XML(URL) if err != nil { logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID()) return assets, err diff --git a/internal/pkg/postprocessor/extractor/html.go b/internal/pkg/postprocessor/extractor/html.go index 5d751df0..31ae5409 100644 --- a/internal/pkg/postprocessor/extractor/html.go +++ b/internal/pkg/postprocessor/extractor/html.go @@ -341,7 +341,7 @@ func HTMLAssets(doc *goquery.Document, URL *models.URL, seed *models.Item) (asse for _, rawAsset := range rawAssets { assets = append(assets, &models.URL{ Raw: rawAsset, - Hops: URL.GetHops(), + Hops: URL.GetHops() + 1, }) } diff --git a/internal/pkg/postprocessor/extractor/json.go b/internal/pkg/postprocessor/extractor/json.go index c21bba74..c1962cc4 100644 --- 
a/internal/pkg/postprocessor/extractor/json.go +++ b/internal/pkg/postprocessor/extractor/json.go @@ -28,7 +28,7 @@ func JSON(URL *models.URL) (assets []*models.URL, err error) { for _, rawAsset := range rawAssets { assets = append(assets, &models.URL{ Raw: rawAsset, - Hops: URL.GetHops(), + Hops: URL.GetHops() + 1, }) } diff --git a/internal/pkg/postprocessor/extractor/link_header.go b/internal/pkg/postprocessor/extractor/link_header.go index 88fa4191..1b992f07 100644 --- a/internal/pkg/postprocessor/extractor/link_header.go +++ b/internal/pkg/postprocessor/extractor/link_header.go @@ -13,7 +13,9 @@ import ( // returning a slice of models.URL structs // Each of these are separated by a `, ` and the in turn by a `; `, with the first always being the url, and the remaining the key-val pairs // See: https://simon-frey.com/blog/link-header/, https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Link -func ExtractURLsFromHeader(link string) (URLs []*models.URL) { +func ExtractURLsFromHeader(URL *models.URL) (URLs []*models.URL) { + var link = URL.GetResponse().Header.Get("link") + if link == "" { return URLs } @@ -25,8 +27,8 @@ func ExtractURLsFromHeader(link string) (URLs []*models.URL) { continue } - URL := strings.TrimSpace(strings.Trim(parts[0], "<>")) - if URL == "" { + extractedURL := strings.TrimSpace(strings.Trim(parts[0], "<>")) + if extractedURL == "" { // Malformed input, URL is empty continue } @@ -43,7 +45,10 @@ func ExtractURLsFromHeader(link string) (URLs []*models.URL) { } } - URLs = append(URLs, &models.URL{Raw: URL}) + URLs = append(URLs, &models.URL{ + Raw: extractedURL, + Hops: URL.GetHops() + 1, + }) } return URLs diff --git a/internal/pkg/postprocessor/extractor/link_header_test.go b/internal/pkg/postprocessor/extractor/link_header_test.go index 113bac38..1482730c 100644 --- a/internal/pkg/postprocessor/extractor/link_header_test.go +++ b/internal/pkg/postprocessor/extractor/link_header_test.go @@ -1,7 +1,10 @@ package extractor import ( + 
"bytes" "fmt" + "io" + "net/http" "reflect" "testing" @@ -83,7 +86,27 @@ func TestExtractURLsFromHeader(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := ExtractURLsFromHeader(tt.link) + resp := &http.Response{ + Body: io.NopCloser(bytes.NewBufferString("")), + Header: http.Header{ + "Link": []string{tt.link}, + }, + } + + var URL = new(models.URL) + URL.SetResponse(resp) + + // Consume the response body + body := bytes.NewBuffer(nil) + _, err := io.Copy(body, resp.Body) + if err != nil { + t.Errorf("unable to read response body: %v", err) + } + + // Set the body in the URL + URL.SetBody(bytes.NewReader(body.Bytes())) + + got := ExtractURLsFromHeader(URL) if !reflect.DeepEqual(got, tt.expected) { t.Fatalf("ExtractURLsFromHeader() = %v, want %v", got, tt.expected) } diff --git a/internal/pkg/postprocessor/extractor/m3u8.go b/internal/pkg/postprocessor/extractor/m3u8.go index 39370c92..25bd4374 100644 --- a/internal/pkg/postprocessor/extractor/m3u8.go +++ b/internal/pkg/postprocessor/extractor/m3u8.go @@ -50,7 +50,7 @@ func M3U8(URL *models.URL) (assets []*models.URL, err error) { for _, rawAsset := range rawAssets { assets = append(assets, &models.URL{ Raw: rawAsset, - Hops: URL.GetHops(), + Hops: URL.GetHops() + 1, }) } diff --git a/internal/pkg/postprocessor/extractor/xml.go b/internal/pkg/postprocessor/extractor/xml.go index 330fdf3a..f9f98aa9 100644 --- a/internal/pkg/postprocessor/extractor/xml.go +++ b/internal/pkg/postprocessor/extractor/xml.go @@ -28,7 +28,7 @@ func IsSitemapXML(URL *models.URL) bool { return isContentType(URL.GetResponse().Header.Get("Content-Type"), "xml") && bytes.Contains(xmlBody, sitemapMarker) } -func XML(URL *models.URL, sitemap bool) (assets []*models.URL, err error) { +func XML(URL *models.URL) (assets []*models.URL, err error) { defer URL.RewindBody() xmlBody, err := io.ReadAll(URL.GetBody()) @@ -75,15 +75,10 @@ func XML(URL *models.URL, sitemap bool) (assets []*models.URL, err error) { } } - 
var hops = URL.GetHops() - if sitemap { - hops += 1 - } - for _, rawAsset := range rawAssets { assets = append(assets, &models.URL{ Raw: rawAsset, - Hops: hops, + Hops: URL.GetHops() + 1, }) } diff --git a/internal/pkg/postprocessor/extractor/xml_test.go b/internal/pkg/postprocessor/extractor/xml_test.go index fa0e3d35..afd6499c 100644 --- a/internal/pkg/postprocessor/extractor/xml_test.go +++ b/internal/pkg/postprocessor/extractor/xml_test.go @@ -138,7 +138,7 @@ func TestXML(t *testing.T) { // Set the body in the URL URL.SetBody(bytes.NewReader(body.Bytes())) - assets, err := XML(URL, false) + assets, err := XML(URL) if (err != nil) != tt.hasError { t.Fatalf("XML() error = %v, wantErr %v", err, tt.hasError) } diff --git a/internal/pkg/postprocessor/outlinks.go b/internal/pkg/postprocessor/outlinks.go index c30cbb0f..f0b1e0be 100644 --- a/internal/pkg/postprocessor/outlinks.go +++ b/internal/pkg/postprocessor/outlinks.go @@ -28,7 +28,7 @@ func extractOutlinks(doc *goquery.Document, URL *models.URL, item *models.Item) return outlinks, err } case extractor.IsSitemapXML(URL): - outlinks, err = extractor.XML(URL, true) + outlinks, err = extractor.XML(URL) if err != nil { logger.Error("unable to extract outlinks", "err", err.Error(), "item", item.GetShortID()) return outlinks, err @@ -44,7 +44,7 @@ func extractOutlinks(doc *goquery.Document, URL *models.URL, item *models.Item) } // Try to extract links from link headers - linksFromLinkHeader := extractor.ExtractURLsFromHeader(URL.GetResponse().Header.Get("link")) + linksFromLinkHeader := extractor.ExtractURLsFromHeader(URL) if linksFromLinkHeader != nil { outlinks = append(outlinks, linksFromLinkHeader...) 
} From 801760bab8b8a1ef3fb2026a2627b7d3b9c6dfd4 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 12 Jan 2025 12:54:37 +0100 Subject: [PATCH 201/295] add: show item depth in archiver --- internal/pkg/archiver/archiver.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index f096258d..714eb1a5 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -120,7 +120,7 @@ func run() { logger.Debug("received resume event") case item, ok := <-globalArchiver.inputCh: if ok { - logger.Debug("received item", "item", item.GetShortID()) + logger.Debug("received item", "item", item.GetShortID(), "depth", item.GetDepth()) guard <- struct{}{} wg.Add(1) stats.ArchiverRoutinesIncr() @@ -130,7 +130,7 @@ func run() { defer stats.ArchiverRoutinesDecr() if item.GetStatus() == models.ItemFailed || item.GetStatus() == models.ItemCompleted { - logger.Debug("skipping item", "item", item.GetShortID(), "status", item.GetStatus().String()) + logger.Debug("skipping item", "item", item.GetShortID(), "depth", item.GetDepth(), "status", item.GetStatus().String()) } else { err := item.CheckConsistency() if err != nil { @@ -142,7 +142,7 @@ func run() { select { case globalArchiver.outputCh <- item: case <-ctx.Done(): - logger.Debug("aborting item due to stop", "item", item.GetShortID()) + logger.Debug("aborting item due to stop", "item", item.GetShortID(), "depth", item.GetDepth()) return } }(ctx) @@ -169,13 +169,13 @@ func archive(item *models.Item) { items, err := item.GetNodesAtLevel(item.GetMaxDepth()) if err != nil { - logger.Error("unable to get nodes at level", "err", err.Error(), "item", item.GetShortID()) + logger.Error("unable to get nodes at level", "err", err.Error(), "item", item.GetShortID(), "depth", item.GetDepth()) panic(err) } for i := range items { if items[i].GetStatus() != models.ItemPreProcessed { - logger.Debug("skipping item", "item", 
items[i].GetShortID(), "status", items[i].GetStatus().String()) + logger.Debug("skipping item", "item", items[i].GetShortID(), "status", items[i].GetStatus().String(), "depth", items[i].GetDepth()) continue } @@ -215,7 +215,7 @@ func archive(item *models.Item) { body := bytes.NewBuffer(nil) _, err = io.Copy(body, resp.Body) if err != nil { - logger.Error("unable to read response body", "err", err.Error(), "item", item.GetShortID()) + logger.Error("unable to read response body", "err", err.Error(), "item", item.GetShortID(), "depth", item.GetDepth()) i.SetStatus(models.ItemFailed) return } @@ -225,7 +225,7 @@ func archive(item *models.Item) { stats.HTTPReturnCodesIncr(strconv.Itoa(resp.StatusCode)) - logger.Info("url archived", "url", i.GetURL().String(), "item", item.GetShortID(), "status", resp.StatusCode) + logger.Info("url archived", "url", i.GetURL().String(), "depth", item.GetDepth(), "item", item.GetShortID(), "status", resp.StatusCode) i.SetStatus(models.ItemArchived) }(items[i]) From 4f889c73494a260d2b06f0b34c7f6d81eb9157d3 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 12 Jan 2025 12:59:55 +0100 Subject: [PATCH 202/295] add: show if an item is a seed in archiver's logs --- internal/pkg/archiver/archiver.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index 714eb1a5..0395200c 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -120,7 +120,7 @@ func run() { logger.Debug("received resume event") case item, ok := <-globalArchiver.inputCh: if ok { - logger.Debug("received item", "item", item.GetShortID(), "depth", item.GetDepth()) + logger.Debug("received item", "item", item.GetShortID(), "depth", item.GetDepth(), "is_seed", item.IsSeed()) guard <- struct{}{} wg.Add(1) stats.ArchiverRoutinesIncr() @@ -130,7 +130,7 @@ func run() { defer stats.ArchiverRoutinesDecr() if item.GetStatus() == models.ItemFailed || 
item.GetStatus() == models.ItemCompleted { - logger.Debug("skipping item", "item", item.GetShortID(), "depth", item.GetDepth(), "status", item.GetStatus().String()) + logger.Debug("skipping item", "item", item.GetShortID(), "depth", item.GetDepth(), "is_seed", item.IsSeed(), "status", item.GetStatus().String()) } else { err := item.CheckConsistency() if err != nil { @@ -142,7 +142,7 @@ func run() { select { case globalArchiver.outputCh <- item: case <-ctx.Done(): - logger.Debug("aborting item due to stop", "item", item.GetShortID(), "depth", item.GetDepth()) + logger.Debug("aborting item due to stop", "item", item.GetShortID(), "depth", item.GetDepth(), "is_seed", item.IsSeed()) return } }(ctx) @@ -169,7 +169,7 @@ func archive(item *models.Item) { items, err := item.GetNodesAtLevel(item.GetMaxDepth()) if err != nil { - logger.Error("unable to get nodes at level", "err", err.Error(), "item", item.GetShortID(), "depth", item.GetDepth()) + logger.Error("unable to get nodes at level", "err", err.Error(), "item", item.GetShortID(), "depth", item.GetDepth(), "is_seed", item.IsSeed()) panic(err) } @@ -215,7 +215,7 @@ func archive(item *models.Item) { body := bytes.NewBuffer(nil) _, err = io.Copy(body, resp.Body) if err != nil { - logger.Error("unable to read response body", "err", err.Error(), "item", item.GetShortID(), "depth", item.GetDepth()) + logger.Error("unable to read response body", "err", err.Error(), "item", item.GetShortID(), "depth", item.GetDepth(), "is_seed", item.IsSeed()) i.SetStatus(models.ItemFailed) return } @@ -225,7 +225,7 @@ func archive(item *models.Item) { stats.HTTPReturnCodesIncr(strconv.Itoa(resp.StatusCode)) - logger.Info("url archived", "url", i.GetURL().String(), "depth", item.GetDepth(), "item", item.GetShortID(), "status", resp.StatusCode) + logger.Info("url archived", "url", i.GetURL().String(), "depth", item.GetDepth(), "is_seed", item.IsSeed(), "item", item.GetShortID(), "status", resp.StatusCode) i.SetStatus(models.ItemArchived) 
}(items[i]) From c9e4c3d4b6d2ae3a4420b5e675811fce66dbd568 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 12 Jan 2025 13:19:57 +0100 Subject: [PATCH 203/295] =?UTF-8?q?chore:=20adding=20some=20"clarit=C3=A9"?= =?UTF-8?q?=20to=20the=20archive()=20function=20variable=20names?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- internal/pkg/archiver/archiver.go | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index 0395200c..b9979bb5 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -156,7 +156,7 @@ func run() { } } -func archive(item *models.Item) { +func archive(seed *models.Item) { // TODO: rate limiting handling logger := log.NewFieldedLogger(&log.Fields{ "component": "archiver.archive", @@ -167,22 +167,22 @@ func archive(item *models.Item) { wg sync.WaitGroup ) - items, err := item.GetNodesAtLevel(item.GetMaxDepth()) + items, err := seed.GetNodesAtLevel(seed.GetMaxDepth()) if err != nil { - logger.Error("unable to get nodes at level", "err", err.Error(), "item", item.GetShortID(), "depth", item.GetDepth(), "is_seed", item.IsSeed()) + logger.Error("unable to get nodes at level", "err", err.Error(), "seed_id", seed.GetShortID()) panic(err) } for i := range items { if items[i].GetStatus() != models.ItemPreProcessed { - logger.Debug("skipping item", "item", items[i].GetShortID(), "status", items[i].GetStatus().String(), "depth", items[i].GetDepth()) + logger.Debug("skipping item", "seed_id", seed.GetShortID(), "item_id", items[i].GetShortID(), "status", items[i].GetStatus().String(), "depth", items[i].GetDepth()) continue } guard <- struct{}{} wg.Add(1) - go func(i *models.Item) { + go func(item *models.Item) { defer wg.Done() defer func() { <-guard }() defer stats.URLsCrawledIncr() @@ -193,7 +193,7 @@ func archive(item *models.Item) { ) // Execute the 
request - req := i.GetURL().GetRequest() + req := item.GetURL().GetRequest() if req == nil { panic("request is nil") } @@ -203,31 +203,31 @@ func archive(item *models.Item) { resp, err = globalArchiver.Client.Do(req) } if err != nil { - logger.Error("unable to execute request", "err", err.Error()) - i.SetStatus(models.ItemFailed) + logger.Error("unable to execute request", "err", err.Error(), "seed_id", seed.GetShortID(), "item_id", item.GetShortID(), "depth", item.GetDepth(), "is_seed", item.IsSeed()) + item.SetStatus(models.ItemFailed) return } // Set the response in the URL - i.GetURL().SetResponse(resp) + item.GetURL().SetResponse(resp) // Consume the response body body := bytes.NewBuffer(nil) _, err = io.Copy(body, resp.Body) if err != nil { - logger.Error("unable to read response body", "err", err.Error(), "item", item.GetShortID(), "depth", item.GetDepth(), "is_seed", item.IsSeed()) - i.SetStatus(models.ItemFailed) + logger.Error("unable to read response body", "err", err.Error(), "seed_id", seed.GetShortID(), "item_id", item.GetShortID(), "depth", item.GetDepth(), "is_seed", item.IsSeed()) + item.SetStatus(models.ItemFailed) return } // Set the body in the URL - i.GetURL().SetBody(bytes.NewReader(body.Bytes())) + item.GetURL().SetBody(bytes.NewReader(body.Bytes())) stats.HTTPReturnCodesIncr(strconv.Itoa(resp.StatusCode)) - logger.Info("url archived", "url", i.GetURL().String(), "depth", item.GetDepth(), "is_seed", item.IsSeed(), "item", item.GetShortID(), "status", resp.StatusCode) + logger.Info("url archived", "url", item.GetURL().String(), "seed_id", seed.GetShortID(), "item_id", item.GetShortID(), "depth", item.GetDepth(), "is_seed", item.IsSeed()) - i.SetStatus(models.ItemArchived) + item.SetStatus(models.ItemArchived) }(items[i]) } From 3313b6693d77567fa130efbc10ab3916eadd94e9 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 12 Jan 2025 13:50:55 +0100 Subject: [PATCH 204/295] chore: log hops in archive() --- internal/pkg/archiver/archiver.go 
| 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index b9979bb5..dbf75d64 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -120,7 +120,7 @@ func run() { logger.Debug("received resume event") case item, ok := <-globalArchiver.inputCh: if ok { - logger.Debug("received item", "item", item.GetShortID(), "depth", item.GetDepth(), "is_seed", item.IsSeed()) + logger.Debug("received item", "item", item.GetShortID(), "depth", item.GetDepth(), "hops", item.GetURL().GetHops()) guard <- struct{}{} wg.Add(1) stats.ArchiverRoutinesIncr() @@ -130,7 +130,7 @@ func run() { defer stats.ArchiverRoutinesDecr() if item.GetStatus() == models.ItemFailed || item.GetStatus() == models.ItemCompleted { - logger.Debug("skipping item", "item", item.GetShortID(), "depth", item.GetDepth(), "is_seed", item.IsSeed(), "status", item.GetStatus().String()) + logger.Debug("skipping item", "item", item.GetShortID(), "depth", item.GetDepth(), "hops", item.GetURL().GetHops(), "status", item.GetStatus().String()) } else { err := item.CheckConsistency() if err != nil { @@ -142,7 +142,7 @@ func run() { select { case globalArchiver.outputCh <- item: case <-ctx.Done(): - logger.Debug("aborting item due to stop", "item", item.GetShortID(), "depth", item.GetDepth(), "is_seed", item.IsSeed()) + logger.Debug("aborting item due to stop", "item", item.GetShortID(), "depth", item.GetDepth(), "hops", item.GetURL().GetHops()) return } }(ctx) @@ -203,7 +203,7 @@ func archive(seed *models.Item) { resp, err = globalArchiver.Client.Do(req) } if err != nil { - logger.Error("unable to execute request", "err", err.Error(), "seed_id", seed.GetShortID(), "item_id", item.GetShortID(), "depth", item.GetDepth(), "is_seed", item.IsSeed()) + logger.Error("unable to execute request", "err", err.Error(), "seed_id", seed.GetShortID(), "item_id", item.GetShortID(), "depth", item.GetDepth(), 
"hops", item.GetURL().GetHops()) item.SetStatus(models.ItemFailed) return } @@ -215,7 +215,7 @@ func archive(seed *models.Item) { body := bytes.NewBuffer(nil) _, err = io.Copy(body, resp.Body) if err != nil { - logger.Error("unable to read response body", "err", err.Error(), "seed_id", seed.GetShortID(), "item_id", item.GetShortID(), "depth", item.GetDepth(), "is_seed", item.IsSeed()) + logger.Error("unable to read response body", "err", err.Error(), "seed_id", seed.GetShortID(), "item_id", item.GetShortID(), "depth", item.GetDepth(), "hops", item.GetURL().GetHops()) item.SetStatus(models.ItemFailed) return } @@ -225,7 +225,7 @@ func archive(seed *models.Item) { stats.HTTPReturnCodesIncr(strconv.Itoa(resp.StatusCode)) - logger.Info("url archived", "url", item.GetURL().String(), "seed_id", seed.GetShortID(), "item_id", item.GetShortID(), "depth", item.GetDepth(), "is_seed", item.IsSeed()) + logger.Info("url archived", "url", item.GetURL().String(), "seed_id", seed.GetShortID(), "item_id", item.GetShortID(), "depth", item.GetDepth(), "hops", item.GetURL().GetHops()) item.SetStatus(models.ItemArchived) }(items[i]) From f542dab04878ddff26a6fac107b94550c9dbc70c Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 12 Jan 2025 14:07:19 +0100 Subject: [PATCH 205/295] fix: max hops stop logic in postprocess() --- internal/pkg/postprocessor/postprocessor.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index 791f2c3d..c0e7b902 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -198,7 +198,9 @@ func postprocess(item *models.Item) (outlinks []*models.Item) { // - the item is a child and the URL has more than one hop // - assets capture is disabled and domains crawl is disabled // - the URL has more hops than the max allowed - if (items[i].IsChild() && items[i].GetURL().GetHops() > 1) || 
config.Get().DisableAssetsCapture && !config.Get().DomainsCrawl && (uint64(config.Get().MaxHops) <= uint64(items[i].GetURL().GetHops())) { + if (items[i].IsChild() && items[i].GetURL().GetHops() > 1) || + (config.Get().DisableAssetsCapture && !config.Get().DomainsCrawl) || + (config.Get().MaxHops <= items[i].GetURL().GetHops()) { items[i].SetStatus(models.ItemCompleted) continue } From 059e191f2a2e74286a231c1c9f326b8b6cdba620 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 12 Jan 2025 14:21:31 +0100 Subject: [PATCH 206/295] fix: possible infinite loop on Facebook site-specific post-processing --- internal/pkg/postprocessor/sitespecific/facebook/facebook.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internal/pkg/postprocessor/sitespecific/facebook/facebook.go b/internal/pkg/postprocessor/sitespecific/facebook/facebook.go index a55c81de..ad0f4003 100644 --- a/internal/pkg/postprocessor/sitespecific/facebook/facebook.go +++ b/internal/pkg/postprocessor/sitespecific/facebook/facebook.go @@ -9,7 +9,9 @@ import ( ) func IsFacebookPostURL(URL *models.URL) bool { - return strings.Contains(URL.String(), "facebook.com") && strings.Contains(URL.String(), "/posts/") + return strings.Contains(URL.String(), "facebook.com") && + strings.Contains(URL.String(), "/posts/") && + !strings.Contains(URL.String(), "/plugins/post.php") } func GenerateEmbedURL(URL *models.URL) *models.URL { From d900837631fb9acbf532d9685f99d10edd79971d Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 12 Jan 2025 14:22:16 +0100 Subject: [PATCH 207/295] fix: avoid graph.facebook.com being triggered by Facebook site-specific post-processing --- internal/pkg/postprocessor/sitespecific/facebook/facebook.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/postprocessor/sitespecific/facebook/facebook.go b/internal/pkg/postprocessor/sitespecific/facebook/facebook.go index ad0f4003..2bc26bb5 100644 ---
a/internal/pkg/postprocessor/sitespecific/facebook/facebook.go +++ b/internal/pkg/postprocessor/sitespecific/facebook/facebook.go @@ -9,7 +9,7 @@ import ( ) func IsFacebookPostURL(URL *models.URL) bool { - return strings.Contains(URL.String(), "facebook.com") && + return strings.Contains(URL.String(), "www.facebook.com") && strings.Contains(URL.String(), "/posts/") && !strings.Contains(URL.String(), "/plugins/post.php") } From 0d08a4a56a045d2694877de6a71edfcac93a7f4b Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 12 Jan 2025 14:27:10 +0100 Subject: [PATCH 208/295] fix: Facebook site-specific post-processing creating item as seed instead of child --- internal/pkg/postprocessor/postprocessor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index c0e7b902..b3038c19 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -187,7 +187,7 @@ func postprocess(item *models.Item) (outlinks []*models.Item) { uuid.New().String(), facebook.GenerateEmbedURL(items[i].GetURL()), items[i].GetURL().String(), - true, + false, ), models.ItemGotChildren) if err != nil { panic(err) From 25dfee2afd06b85d3856faecdb5785b7e652cd77 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 12 Jan 2025 14:49:55 +0100 Subject: [PATCH 209/295] chore: log item ID in case of preprocessor panic --- internal/pkg/preprocessor/preprocessor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go index 9c13c909..21ab2c2e 100644 --- a/internal/pkg/preprocessor/preprocessor.go +++ b/internal/pkg/preprocessor/preprocessor.go @@ -112,7 +112,7 @@ func run() { defer stats.PreprocessorRoutinesDecr() if item.GetStatus() == models.ItemFailed || item.GetStatus() == models.ItemCompleted { - panic(fmt.Sprintf("preprocessor received item with 
status %d", item.GetStatus())) + panic(fmt.Sprintf("preprocessor received item with status %d, item id: %s", item.GetStatus(), item.GetShortID())) } preprocess(item) From 3b358eb326f93aa6f72e1e9b0418dff119bd0f10 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 12 Jan 2025 14:54:47 +0100 Subject: [PATCH 210/295] fix: Facebook site-specific post-processing not updating item's status properly --- internal/pkg/postprocessor/postprocessor.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index b3038c19..bb9ba146 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -192,6 +192,8 @@ func postprocess(item *models.Item) (outlinks []*models.Item) { if err != nil { panic(err) } + + items[i].SetStatus(models.ItemGotChildren) } // Return if: From 0fc21277cae3494fd0eb0a1eeca4f094b090f219 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 12 Jan 2025 14:56:05 +0100 Subject: [PATCH 211/295] remove: Facebook site-specific code until it's fixed --- internal/pkg/postprocessor/postprocessor.go | 33 +++++++++++---------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index bb9ba146..8408a75e 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -9,7 +9,6 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/config" "github.com/internetarchive/Zeno/internal/pkg/controler/pause" "github.com/internetarchive/Zeno/internal/pkg/log" - "github.com/internetarchive/Zeno/internal/pkg/postprocessor/sitespecific/facebook" "github.com/internetarchive/Zeno/internal/pkg/stats" "github.com/internetarchive/Zeno/pkg/models" ) @@ -180,21 +179,23 @@ func postprocess(item *models.Item) (outlinks []*models.Item) { } // Execute site-specific post-processing - switch { - case 
facebook.IsFacebookPostURL(items[i].GetURL()): - err := items[i].AddChild( - models.NewItem( - uuid.New().String(), - facebook.GenerateEmbedURL(items[i].GetURL()), - items[i].GetURL().String(), - false, - ), models.ItemGotChildren) - if err != nil { - panic(err) - } - - items[i].SetStatus(models.ItemGotChildren) - } + // TODO: re-add, but it was causing: + // panic: preprocessor received item with status 4 + // switch { + // case facebook.IsFacebookPostURL(items[i].GetURL()): + // err := items[i].AddChild( + // models.NewItem( + // uuid.New().String(), + // facebook.GenerateEmbedURL(items[i].GetURL()), + // items[i].GetURL().String(), + // false, + // ), models.ItemGotChildren) + // if err != nil { + // panic(err) + // } + + // items[i].SetStatus(models.ItemGotChildren) + // } // Return if: // - the item is a child and the URL has more than one hop From d8cf2d1a4e652e0e84f36d8d295d7f1aa69f2ed4 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 12 Jan 2025 14:58:25 +0100 Subject: [PATCH 212/295] fix: max hops stop logic in postprocess() --- internal/pkg/postprocessor/postprocessor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index 8408a75e..b06af737 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -203,7 +203,7 @@ func postprocess(item *models.Item) (outlinks []*models.Item) { // - the URL has more hops than the max allowed if (items[i].IsChild() && items[i].GetURL().GetHops() > 1) || (config.Get().DisableAssetsCapture && !config.Get().DomainsCrawl) || - (config.Get().MaxHops <= items[i].GetURL().GetHops()) { + (items[i].GetURL().GetHops() >= config.Get().MaxHops) { items[i].SetStatus(models.ItemCompleted) continue } From 73d56e5ee3adfc0b92a651aa355f85091bc259da Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 12 Jan 2025 15:01:56 +0100 Subject: [PATCH 213/295] fix: max hops 
stop logic in postprocess() --- internal/pkg/postprocessor/postprocessor.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/pkg/postprocessor/postprocessor.go b/internal/pkg/postprocessor/postprocessor.go index b06af737..108bee06 100644 --- a/internal/pkg/postprocessor/postprocessor.go +++ b/internal/pkg/postprocessor/postprocessor.go @@ -200,10 +200,10 @@ func postprocess(item *models.Item) (outlinks []*models.Item) { // Return if: // - the item is a child and the URL has more than one hop // - assets capture is disabled and domains crawl is disabled - // - the URL has more hops than the max allowed + // - the item is a seed and the URL has more hops than the max allowed if (items[i].IsChild() && items[i].GetURL().GetHops() > 1) || (config.Get().DisableAssetsCapture && !config.Get().DomainsCrawl) || - (items[i].GetURL().GetHops() >= config.Get().MaxHops) { + (items[i].IsSeed() && (items[i].GetURL().GetHops() >= config.Get().MaxHops)) { items[i].SetStatus(models.ItemCompleted) continue } From e0541a9fcfc71246a04ecabc50c66ee37273bae5 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 12 Jan 2025 17:25:41 +0100 Subject: [PATCH 214/295] fix: close resp.Body after reading it in archive() --- internal/pkg/archiver/archiver.go | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index dbf75d64..8d541698 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -207,6 +207,7 @@ func archive(seed *models.Item) { item.SetStatus(models.ItemFailed) return } + defer resp.Body.Close() // Set the response in the URL item.GetURL().SetResponse(resp) From ec4b543fdffe320e9d4b7767fc40c6201adc8298 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Sun, 12 Jan 2025 17:38:42 +0100 Subject: [PATCH 215/295] stats: correct the getTotal() implementation for stats.rate --- internal/pkg/stats/rate.go | 30 ++++++++++++++------------- 
internal/pkg/stats/rate_test.go | 36 ++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/internal/pkg/stats/rate.go b/internal/pkg/stats/rate.go index ed969f55..fa2790e5 100644 --- a/internal/pkg/stats/rate.go +++ b/internal/pkg/stats/rate.go @@ -6,37 +6,39 @@ import ( ) type rate struct { - count uint64 - lastCount uint64 - lastUpdate int64 + total atomic.Uint64 + count atomic.Uint64 + lastCount atomic.Uint64 + lastUpdate atomic.Int64 } func (rps *rate) incr(step uint64) { - atomic.AddUint64(&rps.count, step) + rps.count.Add(step) + rps.total.Add(step) } func (rps *rate) get() uint64 { now := time.Now().Unix() - lastUpdate := atomic.LoadInt64(&rps.lastUpdate) + lastUpdate := rps.lastUpdate.Load() if now == lastUpdate { - return atomic.LoadUint64(&rps.lastCount) + return rps.lastCount.Load() } - currentCount := atomic.LoadUint64(&rps.count) - lastCount := atomic.SwapUint64(&rps.count, 0) - atomic.StoreUint64(&rps.lastCount, lastCount) - atomic.StoreInt64(&rps.lastUpdate, now) + currentCount := rps.count.Load() + lastCount := rps.count.Swap(0) + rps.lastCount.Store(lastCount) + rps.lastUpdate.Store(now) return currentCount } func (rps *rate) getTotal() uint64 { - return atomic.LoadUint64(&rps.count) + return rps.total.Load() } func (rps *rate) reset() { - atomic.StoreUint64(&rps.count, 0) - atomic.StoreUint64(&rps.lastCount, 0) - atomic.StoreInt64(&rps.lastUpdate, 0) + rps.count.Store(0) + rps.lastCount.Store(0) + rps.lastUpdate.Store(0) } diff --git a/internal/pkg/stats/rate_test.go b/internal/pkg/stats/rate_test.go index e5424e7c..32d003ba 100644 --- a/internal/pkg/stats/rate_test.go +++ b/internal/pkg/stats/rate_test.go @@ -1,7 +1,6 @@ package stats import ( - "sync/atomic" "testing" "time" ) @@ -16,8 +15,9 @@ func TestRate_Start(t *testing.T) { time.Sleep(1100 * time.Millisecond) // Check if the rate per second is correctly updated - if rate.get() != 5 { - t.Errorf("expected rate per second to be 5, got %d", 
rate.get()) + gotRate := rate.get() + if gotRate != 5 { + t.Errorf("expected rate per second to be 5, got %d", gotRate) } // Increment the rate counter again @@ -27,8 +27,9 @@ func TestRate_Start(t *testing.T) { time.Sleep(1100 * time.Millisecond) // Check if the rate per second is correctly updated - if rate.get() != 10 { - t.Errorf("expected rate per second to be 10, got %d", rate.get()) + gotRate = rate.get() + if gotRate != 10 { + t.Errorf("expected rate per second to be 10, got %d", gotRate) } // Increment the rate counter multiple times and check the rate over several seconds @@ -36,8 +37,9 @@ func TestRate_Start(t *testing.T) { rate.incr(2) time.Sleep(1100 * time.Millisecond) expectedRate := uint64(2) - if rate.get() != expectedRate { - t.Errorf("expected rate per second to be %d, got %d", expectedRate, rate.get()) + gotRate = rate.get() + if gotRate != expectedRate { + t.Errorf("expected rate per second to be %d, got %d", expectedRate, gotRate) } } } @@ -49,16 +51,16 @@ func TestRate_Incr(t *testing.T) { rate.incr(3) // Check if the count is correctly incremented - if atomic.LoadUint64(&rate.count) != 3 { - t.Errorf("expected count to be 3, got %d", atomic.LoadUint64(&rate.count)) + if rate.count.Load() != 3 { + t.Errorf("expected count to be 3, got %d", rate.count.Load()) } // Increment the rate counter again rate.incr(2) // Check if the count is correctly incremented - if atomic.LoadUint64(&rate.count) != 5 { - t.Errorf("expected count to be 5, got %d", atomic.LoadUint64(&rate.count)) + if rate.count.Load() != 5 { + t.Errorf("expected count to be 5, got %d", rate.count.Load()) } } @@ -80,11 +82,17 @@ func TestRate_Get(t *testing.T) { func TestRate_GetTotal(t *testing.T) { rate := &rate{} - // Increment the rate counter + // Fuzz the rate counter rate.incr(7) + time.Sleep(1 * time.Second) + rate.get() + rate.incr(3) + time.Sleep(1 * time.Second) + rate.get() + rate.incr(0) // Check if the total count is correctly retrieved - if rate.getTotal() != 7 { - 
t.Errorf("expected total count to be 7, got %d", rate.getTotal()) + if rate.getTotal() != 10 { + t.Errorf("expected total count to be 10, got %d", rate.getTotal()) } } From 17673f608c18b1d93a3c56f20e114e6792b70a95 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 12 Jan 2025 19:27:16 +0100 Subject: [PATCH 216/295] optimize: memory usage of response body when being post-processed --- go.mod | 13 +- go.sum | 26 ++-- internal/pkg/archiver/archiver.go | 17 +-- internal/pkg/postprocessor/assets.go | 27 ++-- internal/pkg/postprocessor/base.go | 4 +- internal/pkg/postprocessor/extractor/html.go | 99 +++++++------ internal/pkg/postprocessor/outlinks.go | 21 ++- internal/pkg/postprocessor/postprocess.go | 139 +++++++++++++++++++ internal/pkg/postprocessor/postprocessor.go | 132 ++---------------- pkg/models/url.go | 58 ++++++++ 10 files changed, 308 insertions(+), 228 deletions(-) create mode 100644 internal/pkg/postprocessor/postprocess.go diff --git a/go.mod b/go.mod index 8bc0c0ed..5e57b91d 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc github.com/elastic/go-elasticsearch v0.0.0 github.com/elastic/go-elasticsearch/v7 v7.17.10 + github.com/gabriel-vasile/mimetype v1.4.8 github.com/gdamore/tcell/v2 v2.7.1 github.com/google/uuid v1.6.0 github.com/grafana/pyroscope-go v1.2.0 @@ -20,7 +21,7 @@ require ( github.com/spf13/pflag v1.0.5 github.com/spf13/viper v1.19.0 go.uber.org/goleak v1.3.0 - golang.org/x/net v0.31.0 + golang.org/x/net v0.33.0 mvdan.cc/xurls/v2 v2.5.0 ) @@ -59,13 +60,13 @@ require ( github.com/syndtr/goleveldb v1.0.0 // indirect github.com/ulikunitz/xz v0.5.12 // indirect go.uber.org/multierr v1.11.0 // indirect - golang.org/x/crypto v0.29.0 // indirect + golang.org/x/crypto v0.31.0 // indirect golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 // indirect golang.org/x/mod v0.21.0 // indirect - golang.org/x/sync v0.9.0 // indirect - golang.org/x/sys v0.27.0 // indirect - 
golang.org/x/term v0.26.0 // indirect - golang.org/x/text v0.20.0 // indirect + golang.org/x/sync v0.10.0 // indirect + golang.org/x/sys v0.28.0 // indirect + golang.org/x/term v0.27.0 // indirect + golang.org/x/text v0.21.0 // indirect golang.org/x/tools v0.25.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index efff8a44..99dce9df 100644 --- a/go.sum +++ b/go.sum @@ -25,6 +25,8 @@ github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7z github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/gabriel-vasile/mimetype v1.4.8 h1:FfZ3gj38NjllZIeJAmMhr+qKL8Wu+nOoI3GqacKw1NM= +github.com/gabriel-vasile/mimetype v1.4.8/go.mod h1:ByKUIKGjh1ODkGM1asKUbQZOLGrPjydw3hYPU2YU9t8= github.com/gdamore/encoding v1.0.0 h1:+7OoQ1Bc6eTm5niUzBa0Ctsh6JbMW6Ra+YNuAtDBdko= github.com/gdamore/encoding v1.0.0/go.mod h1:alR0ol34c49FCSBLjhosxzcPHQbf2trDkoo5dl+VrEg= github.com/gdamore/tcell/v2 v2.7.1 h1:TiCcmpWHiAU7F0rA2I3S2Y4mmLmO9KHxJ7E1QhYzQbc= @@ -141,8 +143,8 @@ go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ= -golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg= +golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= golang.org/x/exp 
v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk= golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= @@ -155,14 +157,14 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= -golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo= -golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM= +golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ= -golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -173,23 +175,23 @@ golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= -golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= -golang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU= -golang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E= +golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/text v0.20.0 
h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= -golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= +golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go index 8d541698..86394fd8 100644 --- a/internal/pkg/archiver/archiver.go +++ b/internal/pkg/archiver/archiver.go @@ -1,9 +1,7 @@ package archiver import ( - "bytes" "context" - "io" "net/http" "strconv" "sync" @@ -207,26 +205,13 @@ func archive(seed *models.Item) { item.SetStatus(models.ItemFailed) return } - defer resp.Body.Close() // Set the response in the URL item.GetURL().SetResponse(resp) - // Consume the response body - body := bytes.NewBuffer(nil) - _, err = io.Copy(body, resp.Body) - if err != nil { - logger.Error("unable to read response body", "err", err.Error(), "seed_id", seed.GetShortID(), "item_id", item.GetShortID(), "depth", item.GetDepth(), "hops", item.GetURL().GetHops()) - item.SetStatus(models.ItemFailed) - return - } - - // Set the body in the URL - item.GetURL().SetBody(bytes.NewReader(body.Bytes())) - stats.HTTPReturnCodesIncr(strconv.Itoa(resp.StatusCode)) - logger.Info("url archived", "url", item.GetURL().String(), "seed_id", seed.GetShortID(), "item_id", item.GetShortID(), "depth", item.GetDepth(), "hops", item.GetURL().GetHops()) + logger.Info("url archived", "url", item.GetURL().String(), "seed_id", seed.GetShortID(), "item_id", item.GetShortID(), "depth", item.GetDepth(), "hops", item.GetURL().GetHops(), "status", resp.StatusCode) item.SetStatus(models.ItemArchived) }(items[i]) 
diff --git a/internal/pkg/postprocessor/assets.go b/internal/pkg/postprocessor/assets.go index 2199a843..ec686469 100644 --- a/internal/pkg/postprocessor/assets.go +++ b/internal/pkg/postprocessor/assets.go @@ -1,16 +1,15 @@ package postprocessor import ( - "github.com/PuerkitoBio/goquery" "github.com/internetarchive/Zeno/internal/pkg/log" "github.com/internetarchive/Zeno/internal/pkg/postprocessor/extractor" "github.com/internetarchive/Zeno/internal/pkg/postprocessor/sitespecific/ina" "github.com/internetarchive/Zeno/pkg/models" ) -func extractAssets(doc *goquery.Document, URL *models.URL, item *models.Item) (assets []*models.URL, err error) { +func extractAssets(item *models.Item) (assets []*models.URL, err error) { var ( - contentType = URL.GetResponse().Header.Get("Content-Type") + contentType = item.GetURL().GetResponse().Header.Get("Content-Type") logger = log.NewFieldedLogger(&log.Fields{ "component": "postprocessor.extractAssets", }) @@ -20,32 +19,32 @@ func extractAssets(doc *goquery.Document, URL *models.URL, item *models.Item) (a switch { // Order is important, we want to check for more specific things first, // as they may trigger more general extractors (e.g. 
HTML) - case ina.IsAPIURL(URL): - assets, err := ina.ExtractMedias(URL) + case ina.IsAPIURL(item.GetURL()): + assets, err := ina.ExtractMedias(item.GetURL()) if err != nil { logger.Error("unable to extract medias from INA", "err", err.Error(), "item", item.GetShortID()) return assets, err } - case extractor.IsM3U8(URL): - assets, err = extractor.M3U8(URL) + case extractor.IsM3U8(item.GetURL()): + assets, err = extractor.M3U8(item.GetURL()) if err != nil { logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID()) return assets, err } - case extractor.IsJSON(URL): - assets, err = extractor.JSON(URL) + case extractor.IsJSON(item.GetURL()): + assets, err = extractor.JSON(item.GetURL()) if err != nil { logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID()) return assets, err } - case extractor.IsXML(URL): - assets, err = extractor.XML(URL) + case extractor.IsXML(item.GetURL()): + assets, err = extractor.XML(item.GetURL()) if err != nil { logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID()) return assets, err } - case extractor.IsHTML(URL): - assets, err = extractor.HTMLAssets(doc, URL, item) + case extractor.IsHTML(item.GetURL()): + assets, err = extractor.HTMLAssets(item) if err != nil { logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID()) return assets, err @@ -54,5 +53,7 @@ func extractAssets(doc *goquery.Document, URL *models.URL, item *models.Item) (a logger.Debug("no extractor used for page", "content-type", contentType, "item", item.GetShortID()) } + logger.Info("extracted assets", "item", item.GetShortID(), "assets", len(assets)) + return } diff --git a/internal/pkg/postprocessor/base.go b/internal/pkg/postprocessor/base.go index ae57324b..96e5044b 100644 --- a/internal/pkg/postprocessor/base.go +++ b/internal/pkg/postprocessor/base.go @@ -5,8 +5,8 @@ import ( "github.com/internetarchive/Zeno/pkg/models" ) -func scrapeBaseTag(doc 
*goquery.Document, item *models.Item) { - doc.Find("base").Each(func(index int, base *goquery.Selection) { +func scrapeBaseTag(item *models.Item) { + item.GetURL().GetDocument().Find("base").Each(func(index int, base *goquery.Selection) { href, exists := base.Attr("href") if exists { item.SetBase(href) diff --git a/internal/pkg/postprocessor/extractor/html.go b/internal/pkg/postprocessor/extractor/html.go index 31ae5409..ca7fb5de 100644 --- a/internal/pkg/postprocessor/extractor/html.go +++ b/internal/pkg/postprocessor/extractor/html.go @@ -21,7 +21,7 @@ func IsHTML(URL *models.URL) bool { return isContentType(URL.GetResponse().Header.Get("Content-Type"), "html") } -func HTMLOutlinks(doc *goquery.Document, URL *models.URL) (outlinks []*models.URL, err error) { +func HTMLOutlinks(URL *models.URL) (outlinks []*models.URL, err error) { defer URL.RewindBody() // logger := log.NewFieldedLogger(&log.Fields{ @@ -52,9 +52,9 @@ func HTMLOutlinks(doc *goquery.Document, URL *models.URL) (outlinks []*models.UR "srcset", } - doc.Find("a").Each(func(index int, item *goquery.Selection) { + URL.GetDocument().Find("a").Each(func(index int, i *goquery.Selection) { for _, attr := range validAssetAttributes { - link, exists := item.Attr(attr) + link, exists := i.Attr(attr) if exists { if utils.StringContainsSliceElements(link, validAssetPath) { rawOutlinks = append(rawOutlinks, link) @@ -74,8 +74,8 @@ func HTMLOutlinks(doc *goquery.Document, URL *models.URL) (outlinks []*models.UR return outlinks, nil } -func HTMLAssets(doc *goquery.Document, URL *models.URL, seed *models.Item) (assets []*models.URL, err error) { - defer URL.RewindBody() +func HTMLAssets(item *models.Item) (assets []*models.URL, err error) { + defer item.GetURL().RewindBody() logger := log.NewFieldedLogger(&log.Fields{ "component": "postprocessor.extractor.HTMLAssets", @@ -83,13 +83,18 @@ func HTMLAssets(doc *goquery.Document, URL *models.URL, seed *models.Item) (asse var rawAssets []string + if 
item.GetURL().GetDocument() == nil { + logger.Error("no document in URL struct", "url", item.GetURL().String(), "item", item.GetShortID()) + return + } + // Get assets from JSON payloads in data-item values - doc.Find("[data-item]").Each(func(index int, item *goquery.Selection) { - dataItem, exists := item.Attr("data-item") + item.GetURL().GetDocument().Find("[data-item]").Each(func(index int, i *goquery.Selection) { + dataItem, exists := i.Attr("data-item") if exists { URLsFromJSON, err := GetURLsFromJSON([]byte(dataItem)) if err != nil { - logger.Debug("unable to extract URLs from JSON in data-item attribute", "err", err, "url", URL.String(), "item", seed.GetShortID()) + logger.Debug("unable to extract URLs from JSON in data-item attribute", "err", err, "url", item.GetURL().String(), "item", item.GetShortID()) } else { rawAssets = append(rawAssets, URLsFromJSON...) } @@ -97,8 +102,8 @@ func HTMLAssets(doc *goquery.Document, URL *models.URL, seed *models.Item) (asse }) // Check all elements style attributes for background-image & also data-preview - doc.Find("*").Each(func(index int, item *goquery.Selection) { - style, exists := item.Attr("style") + item.GetURL().GetDocument().Find("*").Each(func(index int, i *goquery.Selection) { + style, exists := i.Attr("style") if exists { matches := backgroundImageRegex.FindAllStringSubmatch(style, -1) @@ -107,7 +112,13 @@ func HTMLAssets(doc *goquery.Document, URL *models.URL, seed *models.Item) (asse matchFound := matches[match][1] // Don't extract CSS elements that aren't URLs - if strings.Contains(matchFound, "%") || strings.HasPrefix(matchFound, "0.") || strings.HasPrefix(matchFound, "--font") || strings.HasPrefix(matchFound, "--size") || strings.HasPrefix(matchFound, "--color") || strings.HasPrefix(matchFound, "--shreddit") || strings.HasPrefix(matchFound, "100vh") { + if strings.Contains(matchFound, "%") || + strings.HasPrefix(matchFound, "0.") || + strings.HasPrefix(matchFound, "--font") || + 
strings.HasPrefix(matchFound, "--size") || + strings.HasPrefix(matchFound, "--color") || + strings.HasPrefix(matchFound, "--shreddit") || + strings.HasPrefix(matchFound, "100vh") { continue } @@ -116,7 +127,7 @@ func HTMLAssets(doc *goquery.Document, URL *models.URL, seed *models.Item) (asse } } - dataPreview, exists := item.Attr("data-preview") + dataPreview, exists := i.Attr("data-preview") if exists { if strings.HasPrefix(dataPreview, "http") { rawAssets = append(rawAssets, dataPreview) @@ -126,23 +137,23 @@ func HTMLAssets(doc *goquery.Document, URL *models.URL, seed *models.Item) (asse // Extract assets on the page (images, scripts, videos..) if !utils.StringInSlice("img", config.Get().DisableHTMLTag) { - doc.Find("img").Each(func(index int, item *goquery.Selection) { - link, exists := item.Attr("src") + item.GetURL().GetDocument().Find("img").Each(func(index int, i *goquery.Selection) { + link, exists := i.Attr("src") if exists { rawAssets = append(rawAssets, link) } - link, exists = item.Attr("data-src") + link, exists = i.Attr("data-src") if exists { rawAssets = append(rawAssets, link) } - link, exists = item.Attr("data-lazy-src") + link, exists = i.Attr("data-lazy-src") if exists { rawAssets = append(rawAssets, link) } - link, exists = item.Attr("data-srcset") + link, exists = i.Attr("data-srcset") if exists { links := strings.Split(link, ",") for _, link := range links { @@ -150,7 +161,7 @@ func HTMLAssets(doc *goquery.Document, URL *models.URL, seed *models.Item) (asse } } - link, exists = item.Attr("srcset") + link, exists = i.Attr("srcset") if exists { links := strings.Split(link, ",") for _, link := range links { @@ -161,8 +172,8 @@ func HTMLAssets(doc *goquery.Document, URL *models.URL, seed *models.Item) (asse } if !utils.StringInSlice("video", config.Get().DisableHTMLTag) { - doc.Find("video").Each(func(index int, item *goquery.Selection) { - link, exists := item.Attr("src") + item.GetURL().GetDocument().Find("video").Each(func(index int, i 
*goquery.Selection) { + link, exists := i.Attr("src") if exists { rawAssets = append(rawAssets, link) } @@ -170,8 +181,8 @@ func HTMLAssets(doc *goquery.Document, URL *models.URL, seed *models.Item) (asse } if !utils.StringInSlice("style", config.Get().DisableHTMLTag) { - doc.Find("style").Each(func(index int, item *goquery.Selection) { - matches := urlRegex.FindAllStringSubmatch(item.Text(), -1) + item.GetURL().GetDocument().Find("style").Each(func(index int, i *goquery.Selection) { + matches := urlRegex.FindAllStringSubmatch(i.Text(), -1) for match := range matches { matchReplacement := matches[match][1] matchReplacement = strings.Replace(matchReplacement, "'", "", -1) @@ -192,16 +203,16 @@ func HTMLAssets(doc *goquery.Document, URL *models.URL, seed *models.Item) (asse } if !utils.StringInSlice("script", config.Get().DisableHTMLTag) { - doc.Find("script").Each(func(index int, item *goquery.Selection) { - link, exists := item.Attr("src") + item.GetURL().GetDocument().Find("script").Each(func(index int, i *goquery.Selection) { + link, exists := i.Attr("src") if exists { rawAssets = append(rawAssets, link) } - scriptType, exists := item.Attr("type") + scriptType, exists := i.Attr("type") if exists { if scriptType == "application/json" { - URLsFromJSON, err := GetURLsFromJSON([]byte(item.Text())) + URLsFromJSON, err := GetURLsFromJSON([]byte(i.Text())) if err != nil { // TODO: maybe add back when https://github.com/internetarchive/Zeno/issues/147 is fixed // c.Log.Debug("unable to extract URLs from JSON in script tag", "error", err, "url", URL) @@ -212,9 +223,9 @@ func HTMLAssets(doc *goquery.Document, URL *models.URL, seed *models.Item) (asse } // Apply regex on the script's HTML to extract potential assets - outerHTML, err := goquery.OuterHtml(item) + outerHTML, err := goquery.OuterHtml(i) if err != nil { - logger.Debug("unable to extract outer HTML from script tag", "err", err, "url", URL.String(), "item", seed.GetShortID()) + logger.Debug("unable to extract 
outer HTML from script tag", "err", err, "url", item.GetURL().String(), "item", item.GetShortID()) } else { scriptLinks := utils.DedupeStrings(LinkRegexRelaxed.FindAllString(outerHTML, -1)) for _, scriptLink := range scriptLinks { @@ -222,7 +233,7 @@ func HTMLAssets(doc *goquery.Document, URL *models.URL, seed *models.Item) (asse // Escape URLs when unicode runes are present in the extracted URLs scriptLink, err := strconv.Unquote(`"` + scriptLink + `"`) if err != nil { - logger.Debug("unable to escape URL from JSON in script tag", "error", err, "url", scriptLink, "item", seed.GetShortID()) + logger.Debug("unable to escape URL from JSON in script tag", "error", err, "url", scriptLink, "item", item.GetShortID()) continue } rawAssets = append(rawAssets, scriptLink) @@ -231,8 +242,8 @@ func HTMLAssets(doc *goquery.Document, URL *models.URL, seed *models.Item) (asse } // Some