diff --git a/.gitignore b/.gitignore
index 98455fe3..28459922 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,8 +2,12 @@ jobs/*
jobs/
Zeno
*.txt
-*.sh
+*.sh*
zeno.log
.vscode/
*.py
-.DS_Store
\ No newline at end of file
+.DS_Store
+ZENO-*
+output.log
+.old/
+*.warc.*
\ No newline at end of file
diff --git a/.old/go.mod b/.old/go.mod
new file mode 100644
index 00000000..1200c6fa
--- /dev/null
+++ b/.old/go.mod
@@ -0,0 +1,95 @@
+module github.com/internetarchive/Zeno
+
+go 1.22.4
+
+require (
+ github.com/CorentinB/warc v0.8.53
+ github.com/PuerkitoBio/goquery v1.9.3
+ github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2
+ github.com/clbanning/mxj/v2 v2.7.0
+ github.com/dustin/go-humanize v1.0.1
+ github.com/elastic/go-elasticsearch/v8 v8.15.0
+ github.com/google/uuid v1.6.0
+ github.com/gosuri/uilive v0.0.4
+ github.com/gosuri/uitable v0.0.4
+ github.com/grafov/m3u8 v0.12.0
+ github.com/internetarchive/gocrawlhq v1.2.20
+ github.com/paulbellamy/ratecounter v0.2.0
+ github.com/philippgille/gokv/leveldb v0.7.0
+ github.com/prometheus/client_golang v1.20.4
+ github.com/remeh/sizedwaitgroup v1.0.0
+ github.com/sirupsen/logrus v1.9.3
+ github.com/spf13/cobra v1.8.1
+ github.com/spf13/pflag v1.0.5
+ github.com/spf13/viper v1.19.0
+ github.com/telanflow/cookiejar v0.0.0-20190719062046-114449e86aa5
+ go.uber.org/goleak v1.3.0
+ golang.org/x/net v0.29.0
+ google.golang.org/protobuf v1.34.2
+ mvdan.cc/xurls/v2 v2.5.0
+)
+
+require (
+ github.com/andybalholm/brotli v1.1.0 // indirect
+ github.com/andybalholm/cascadia v1.3.2 // indirect
+ github.com/aws/aws-sdk-go v1.55.5 // indirect
+ github.com/beorn7/perks v1.0.1 // indirect
+ github.com/cespare/xxhash/v2 v2.3.0 // indirect
+ github.com/cloudflare/circl v1.4.0 // indirect
+ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
+ github.com/elastic/elastic-transport-go/v8 v8.6.0 // indirect
+ github.com/fatih/color v1.17.0 // indirect
+ github.com/fsnotify/fsnotify v1.7.0 // indirect
+ github.com/go-logr/logr v1.4.2 // indirect
+ github.com/go-logr/stdr v1.2.2 // indirect
+ github.com/gobwas/httphead v0.1.0 // indirect
+ github.com/gobwas/pool v0.2.1 // indirect
+ github.com/gobwas/ws v1.4.0 // indirect
+ github.com/golang/snappy v0.0.4 // indirect
+ github.com/gomodule/redigo v1.9.2 // indirect
+ github.com/google/go-cmp v0.6.0 // indirect
+ github.com/hashicorp/hcl v1.0.0 // indirect
+ github.com/inconshreveable/mousetrap v1.1.0 // indirect
+ github.com/jmespath/go-jmespath v0.4.0 // indirect
+ github.com/json-iterator/go v1.1.12 // indirect
+ github.com/klauspost/compress v1.17.10 // indirect
+ github.com/magiconair/properties v1.8.7 // indirect
+ github.com/mattn/go-colorable v0.1.13 // indirect
+ github.com/mattn/go-isatty v0.0.20 // indirect
+ github.com/mattn/go-runewidth v0.0.16 // indirect
+ github.com/miekg/dns v1.1.62 // indirect
+ github.com/mitchellh/mapstructure v1.5.0 // indirect
+ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
+ github.com/modern-go/reflect2 v1.0.2 // indirect
+ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
+ github.com/onsi/gomega v1.34.2 // indirect
+ github.com/pelletier/go-toml/v2 v2.2.3 // indirect
+ github.com/philippgille/gokv/encoding v0.7.0 // indirect
+ github.com/philippgille/gokv/util v0.7.0 // indirect
+ github.com/prometheus/client_model v0.6.1 // indirect
+ github.com/prometheus/common v0.59.1 // indirect
+ github.com/prometheus/procfs v0.15.1 // indirect
+ github.com/refraction-networking/utls v1.6.7 // indirect
+ github.com/rivo/uniseg v0.4.7 // indirect
+ github.com/sagikazarmark/locafero v0.6.0 // indirect
+ github.com/sagikazarmark/slog-shim v0.1.0 // indirect
+ github.com/sourcegraph/conc v0.3.0 // indirect
+ github.com/spf13/afero v1.11.0 // indirect
+ github.com/spf13/cast v1.7.0 // indirect
+ github.com/subosito/gotenv v1.6.0 // indirect
+ github.com/syndtr/goleveldb v1.0.0 // indirect
+ github.com/ulikunitz/xz v0.5.12 // indirect
+ go.opentelemetry.io/otel v1.30.0 // indirect
+ go.opentelemetry.io/otel/metric v1.30.0 // indirect
+ go.opentelemetry.io/otel/trace v1.30.0 // indirect
+ go.uber.org/multierr v1.11.0 // indirect
+ golang.org/x/crypto v0.27.0 // indirect
+ golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 // indirect
+ golang.org/x/mod v0.21.0 // indirect
+ golang.org/x/sync v0.8.0 // indirect
+ golang.org/x/sys v0.26.0 // indirect
+ golang.org/x/text v0.18.0 // indirect
+ golang.org/x/tools v0.25.0 // indirect
+ gopkg.in/ini.v1 v1.67.0 // indirect
+ gopkg.in/yaml.v3 v3.0.1 // indirect
+)
diff --git a/.old/go.sum b/.old/go.sum
new file mode 100644
index 00000000..a4f66821
--- /dev/null
+++ b/.old/go.sum
@@ -0,0 +1,291 @@
+git.archive.org/wb/gocrawlhq v1.2.13 h1:PqEhgtYqNEUWO2JEJUHmXT+nIwW9LRgb4ocUFANciQo=
+git.archive.org/wb/gocrawlhq v1.2.13/go.mod h1:JQIKgebFmpbxmEalNRjID3RwCxHkslt3PHAnum82KtM=
+github.com/CorentinB/warc v0.8.52 h1:k6lkq3uh6PkhZG+WKpPEkeQPmO1byb7MnSZaNT28SH4=
+github.com/CorentinB/warc v0.8.52/go.mod h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8=
+github.com/CorentinB/warc v0.8.53 h1:xVz3RMdZ6faAqTtLfcK1/yl8ZTansy+B2en//EZLUlM=
+github.com/CorentinB/warc v0.8.53/go.mod h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8=
+github.com/PuerkitoBio/goquery v1.9.3 h1:mpJr/ikUA9/GNJB/DBZcGeFDXUtosHRyRrwh7KGdTG0=
+github.com/PuerkitoBio/goquery v1.9.3/go.mod h1:1ndLHPdTz+DyQPICCWYlYQMPl0oXZj0G6D4LCYA6u4U=
+github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4=
+github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4=
+github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M=
+github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY=
+github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
+github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
+github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
+github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
+github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so=
+github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw=
+github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU=
+github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU=
+github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
+github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
+github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
+github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
+github.com/clbanning/mxj/v2 v2.7.0 h1:WA/La7UGCanFe5NpHF0Q3DNtnCsVoxbPKuyBNHWRyME=
+github.com/clbanning/mxj/v2 v2.7.0/go.mod h1:hNiWqW14h+kc+MdF9C6/YoRfjEJoR3ou6tn/Qo+ve2s=
+github.com/cloudflare/circl v1.4.0 h1:BV7h5MgrktNzytKmWjpOtdYrf0lkkbF8YMlBGPhJQrY=
+github.com/cloudflare/circl v1.4.0/go.mod h1:PDRU+oXvdD7KCtgKxW95M5Z8BpSCJXQORiZFnBQS5QU=
+github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
+github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
+github.com/elastic/elastic-transport-go/v8 v8.6.0 h1:Y2S/FBjx1LlCv5m6pWAF2kDJAHoSjSRSJCApolgfthA=
+github.com/elastic/elastic-transport-go/v8 v8.6.0/go.mod h1:YLHer5cj0csTzNFXoNQ8qhtGY1GTvSqPnKWKaqQE3Hk=
+github.com/elastic/go-elasticsearch/v8 v8.15.0 h1:IZyJhe7t7WI3NEFdcHnf6IJXqpRf+8S8QWLtZYYyBYk=
+github.com/elastic/go-elasticsearch/v8 v8.15.0/go.mod h1:HCON3zj4btpqs2N1jjsAy4a/fiAul+YBP00mBH4xik8=
+github.com/fatih/color v1.17.0 h1:GlRw1BRJxkpqUCBKzKOw098ed57fEsKeNjpTe3cSjK4=
+github.com/fatih/color v1.17.0/go.mod h1:YZ7TlrGPkiz6ku9fK3TLD/pl3CpsiFyu8N92HLgmosI=
+github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
+github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0=
+github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
+github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
+github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
+github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
+github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
+github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
+github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
+github.com/go-test/deep v1.1.0 h1:WOcxcdHcvdgThNXjw0t76K42FXTU7HpNQWHpA2HHNlg=
+github.com/go-test/deep v1.1.0/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE=
+github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
+github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
+github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
+github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
+github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
+github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
+github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
+github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM=
+github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
+github.com/gomodule/redigo v1.9.2 h1:HrutZBLhSIU8abiSfW8pj8mPhOyMYjZT/wcA4/L9L9s=
+github.com/gomodule/redigo v1.9.2/go.mod h1:KsU3hiK/Ay8U42qpaJk+kuNa3C+spxapWpM+ywhcgtw=
+github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
+github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
+github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
+github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/gosuri/uilive v0.0.4 h1:hUEBpQDj8D8jXgtCdBu7sWsy5sbW/5GhuO8KBwJ2jyY=
+github.com/gosuri/uilive v0.0.4/go.mod h1:V/epo5LjjlDE5RJUcqx8dbw+zc93y5Ya3yg8tfZ74VI=
+github.com/gosuri/uitable v0.0.4 h1:IG2xLKRvErL3uhY6e1BylFzG+aJiwQviDDTfOKeKTpY=
+github.com/gosuri/uitable v0.0.4/go.mod h1:tKR86bXuXPZazfOTG1FIzvjIdXzd0mo4Vtn16vt0PJo=
+github.com/grafov/m3u8 v0.12.0 h1:T6iTwTsSEtMcwkayef+FJO8kj+Sglr4Lh81Zj8Ked/4=
+github.com/grafov/m3u8 v0.12.0/go.mod h1:nqzOkfBiZJENr52zTVd/Dcl03yzphIMbJqkXGu+u080=
+github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
+github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
+github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=
+github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
+github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
+github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
+github.com/internetarchive/gocrawlhq v1.2.13 h1:ALfUrWR7nRez5gWhHRJ7ZklIpGMjERGMUJqR4HBl4+8=
+github.com/internetarchive/gocrawlhq v1.2.13/go.mod h1:JQIKgebFmpbxmEalNRjID3RwCxHkslt3PHAnum82KtM=
+github.com/internetarchive/gocrawlhq v1.2.14 h1:g3MPMonpA6mTkCpjBvW3paeBHiH+gGgwSvkyX/lxu7s=
+github.com/internetarchive/gocrawlhq v1.2.14/go.mod h1:IOHVfWsptADzh+r2J+UnSm22EB9r8TiVVeAuP9WRFoc=
+github.com/internetarchive/gocrawlhq v1.2.15 h1:Llv6tvxxRUxoC9G4GsjkpbfKX0anbQUU+pwFiROlxzg=
+github.com/internetarchive/gocrawlhq v1.2.15/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs=
+github.com/internetarchive/gocrawlhq v1.2.16 h1:D9JJdLL8uqpHUDU3SxxcXUjQETbxnk08e9xo929xrlE=
+github.com/internetarchive/gocrawlhq v1.2.16/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs=
+github.com/internetarchive/gocrawlhq v1.2.17 h1:nSjFHpDp5C9Q8SrDPibC4Iiih6kpw18+2GnifJiVpO0=
+github.com/internetarchive/gocrawlhq v1.2.17/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs=
+github.com/internetarchive/gocrawlhq v1.2.18 h1:PPe7UqJ2NNOljn70SmUhoKdgPreeqRUk9XVrYShCn4w=
+github.com/internetarchive/gocrawlhq v1.2.18/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs=
+github.com/internetarchive/gocrawlhq v1.2.19 h1:bvDliaeWjt97x64bOf+rKXStQX7VE+ZON/I1FS3sQ6A=
+github.com/internetarchive/gocrawlhq v1.2.19/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek=
+github.com/internetarchive/gocrawlhq v1.2.20 h1:0mIIt9lhPacKr6L2JeISoopQ8EgzC3dISJ3ITGGbOp4=
+github.com/internetarchive/gocrawlhq v1.2.20/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek=
+github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
+github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
+github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
+github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
+github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
+github.com/klauspost/compress v1.17.10 h1:oXAz+Vh0PMUvJczoi+flxpnBEPxoER1IaAnU/NMPtT0=
+github.com/klauspost/compress v1.17.10/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0=
+github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
+github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
+github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
+github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
+github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
+github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY=
+github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0=
+github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
+github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
+github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
+github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
+github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
+github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
+github.com/miekg/dns v1.1.62 h1:cN8OuEF1/x5Rq6Np+h1epln8OiyPWV+lROx9LxcGgIQ=
+github.com/miekg/dns v1.1.62/go.mod h1:mvDlcItzm+br7MToIKqkglaGhlFMHJ9DTNNWONWXbNQ=
+github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
+github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
+github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
+github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
+github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
+github.com/onsi/ginkgo v1.7.0 h1:WSHQ+IS43OoUrWtD1/bbclrwK8TTH5hzp+umCiuxHgs=
+github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
+github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
+github.com/onsi/gomega v1.34.2 h1:pNCwDkzrsv7MS9kpaQvVb1aVLahQXyJ/Tv5oAZMI3i8=
+github.com/onsi/gomega v1.34.2/go.mod h1:v1xfxRgk0KIsG+QOdm7p8UosrOzPYRo60fd3B/1Dukc=
+github.com/paulbellamy/ratecounter v0.2.0 h1:2L/RhJq+HA8gBQImDXtLPrDXK5qAj6ozWVK/zFXVJGs=
+github.com/paulbellamy/ratecounter v0.2.0/go.mod h1:Hfx1hDpSGoqxkVVpBi/IlYD7kChlfo5C6hzIHwPqfFE=
+github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M=
+github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc=
+github.com/philippgille/gokv v0.7.0 h1:rQSIQspete82h78Br7k7rKUZ8JYy/hWlwzm/W5qobPI=
+github.com/philippgille/gokv v0.7.0/go.mod h1:OwiTP/3bhEBhSuOmFmq1+rszglfSgjJVxd1HOgOa2N4=
+github.com/philippgille/gokv/encoding v0.7.0 h1:2oxepKzzTsi00iLZBCZ7Rmqrallh9zws3iqSrLGfkgo=
+github.com/philippgille/gokv/encoding v0.7.0/go.mod h1:yncOBBUciyniPI8t5ECF8XSCwhONE9Rjf3My5IHs3fA=
+github.com/philippgille/gokv/leveldb v0.7.0 h1:QTH83utBG8knTTFzO1tIF1amKIjz9xxOPLaZrU48kdQ=
+github.com/philippgille/gokv/leveldb v0.7.0/go.mod h1:EE/vyZ5VwPPWwZHKJYWd/rkqUIJXFykKA5eluazFByc=
+github.com/philippgille/gokv/test v0.7.0 h1:0wBKnKaFZlSeHxLXcmUJqK//IQGUMeu+o8B876KCiOM=
+github.com/philippgille/gokv/test v0.7.0/go.mod h1:TP/VzO/qAoi6njsfKnRpXKno0hRuzD5wsLnHhtUcVkY=
+github.com/philippgille/gokv/util v0.7.0 h1:5avUK/a3aSj/aWjhHv4/FkqgMon2B7k2BqFgLcR+DYg=
+github.com/philippgille/gokv/util v0.7.0/go.mod h1:i9KLHbPxGiHLMhkix/CcDQhpPbCkJy5BkW+RKgwDHMo=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/prometheus/client_golang v1.20.4 h1:Tgh3Yr67PaOv/uTqloMsCEdeuFTatm5zIq5+qNN23vI=
+github.com/prometheus/client_golang v1.20.4/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
+github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
+github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
+github.com/prometheus/common v0.59.1 h1:LXb1quJHWm1P6wq/U824uxYi4Sg0oGvNeUm1z5dJoX0=
+github.com/prometheus/common v0.59.1/go.mod h1:GpWM7dewqmVYcd7SmRaiWVe9SSqjf0UrwnYnpEZNuT0=
+github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
+github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
+github.com/refraction-networking/utls v1.6.7 h1:zVJ7sP1dJx/WtVuITug3qYUq034cDq9B2MR1K67ULZM=
+github.com/refraction-networking/utls v1.6.7/go.mod h1:BC3O4vQzye5hqpmDTWUqi4P5DDhzJfkV1tdqtawQIH0=
+github.com/remeh/sizedwaitgroup v1.0.0 h1:VNGGFwNo/R5+MJBf6yrsr110p0m4/OX4S3DCy7Kyl5E=
+github.com/remeh/sizedwaitgroup v1.0.0/go.mod h1:3j2R4OIe/SeS6YDhICBy22RWjJC5eNCJ1V+9+NVNYlo=
+github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
+github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
+github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
+github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
+github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
+github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
+github.com/sagikazarmark/locafero v0.6.0 h1:ON7AQg37yzcRPU69mt7gwhFEBwxI6P9T4Qu3N51bwOk=
+github.com/sagikazarmark/locafero v0.6.0/go.mod h1:77OmuIc6VTraTXKXIs/uvUxKGUXjE1GbemJYHqdNjX0=
+github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE=
+github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ=
+github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
+github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
+github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo=
+github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0=
+github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8=
+github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY=
+github.com/spf13/cast v1.7.0 h1:ntdiHjuueXFgm5nzDRdOS4yfT43P5Fnud6DH50rz/7w=
+github.com/spf13/cast v1.7.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
+github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
+github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
+github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
+github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
+github.com/spf13/viper v1.19.0 h1:RWq5SEjt8o25SROyN3z2OrDB9l7RPd3lwTWU8EcEdcI=
+github.com/spf13/viper v1.19.0/go.mod h1:GQUN9bilAbhU/jgc1bKs99f/suXKeUMct8Adx5+Ntkg=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
+github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8=
+github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU=
+github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE=
+github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ=
+github.com/telanflow/cookiejar v0.0.0-20190719062046-114449e86aa5 h1:gTQl5nPlc9B53vFOKM8aJHwxB2BW2kM49PVR5526GBg=
+github.com/telanflow/cookiejar v0.0.0-20190719062046-114449e86aa5/go.mod h1:qNgA5MKwTh103SxGTooqZMiKxZTaV9UV3KjN7I7Drig=
+github.com/ulikunitz/xz v0.5.12 h1:37Nm15o69RwBkXM0J6A5OlE67RZTfzUxTj8fB3dfcsc=
+github.com/ulikunitz/xz v0.5.12/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
+go.opentelemetry.io/otel v1.30.0 h1:F2t8sK4qf1fAmY9ua4ohFS/K+FUuOPemHUIXHtktrts=
+go.opentelemetry.io/otel v1.30.0/go.mod h1:tFw4Br9b7fOS+uEao81PJjVMjW/5fvNCbpsDIXqP0pc=
+go.opentelemetry.io/otel/metric v1.30.0 h1:4xNulvn9gjzo4hjg+wzIKG7iNFEaBMX00Qd4QIZs7+w=
+go.opentelemetry.io/otel/metric v1.30.0/go.mod h1:aXTfST94tswhWEb+5QjlSqG+cZlmyXy/u8jFpor3WqQ=
+go.opentelemetry.io/otel/sdk v1.21.0 h1:FTt8qirL1EysG6sTQRZ5TokkU8d0ugCj8htOgThZXQ8=
+go.opentelemetry.io/otel/sdk v1.21.0/go.mod h1:Nna6Yv7PWTdgJHVRD9hIYywQBRx7pbox6nwBnZIxl/E=
+go.opentelemetry.io/otel/trace v1.30.0 h1:7UBkkYzeg3C7kQX8VAidWh2biiQbtAKjyIML8dQ9wmc=
+go.opentelemetry.io/otel/trace v1.30.0/go.mod h1:5EyKqTzzmyqB9bwtCCq6pDLktPK6fmGf/Dph+8VI02o=
+go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
+go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
+go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
+go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
+golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A=
+golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70=
+golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk=
+golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0=
+golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY=
+golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
+golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo=
+golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0=
+golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
+golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
+golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo=
+golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
+golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
+golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224=
+golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
+golang.org/x/tools v0.25.0 h1:oFU9pkj/iJgs+0DT+VMHrx+oBKs/LJMV+Uvg78sl+fE=
+golang.org/x/tools v0.25.0/go.mod h1:/vtpO8WL1N9cQC3FN5zPqb//fRXskFHbLKk4OW1Q7rg=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
+google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
+gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
+gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4=
+gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
+gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA=
+gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
+gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
+gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
+gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
+gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+mvdan.cc/xurls/v2 v2.5.0 h1:lyBNOm8Wo71UknhUs4QTFUNNMyxy2JEIaKKo0RWOh+8=
+mvdan.cc/xurls/v2 v2.5.0/go.mod h1:yQgaGQ1rFtJUzkmKiHYSSfuQxqfYmd//X6PxvholpeE=
diff --git a/internal/pkg/crawl/api.go b/.old/internal/pkg/crawl/api.go
similarity index 100%
rename from internal/pkg/crawl/api.go
rename to .old/internal/pkg/crawl/api.go
diff --git a/internal/pkg/crawl/assets.go b/.old/internal/pkg/crawl/assets.go
similarity index 66%
rename from internal/pkg/crawl/assets.go
rename to .old/internal/pkg/crawl/assets.go
index 9aaa90eb..754602f4 100644
--- a/internal/pkg/crawl/assets.go
+++ b/.old/internal/pkg/crawl/assets.go
@@ -1,183 +1,178 @@
package crawl
import (
- "io"
- "net/http"
"net/url"
- "regexp"
"strconv"
"strings"
- "sync/atomic"
"github.com/PuerkitoBio/goquery"
"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
"github.com/internetarchive/Zeno/internal/pkg/queue"
"github.com/internetarchive/Zeno/internal/pkg/utils"
- "github.com/remeh/sizedwaitgroup"
)
-var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`)
-var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`)
-
-func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie, headers map[string]string) error {
- var resp *http.Response
-
- // Prepare GET request
- req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil)
- if err != nil {
- return err
- }
-
- req.Header.Set("Referer", utils.URLToString(item.ParentURL))
- req.Header.Set("User-Agent", c.UserAgent)
-
- // If headers are passed, apply them to the request
- if headers != nil {
- for key, value := range headers {
- req.Header.Set(key, value)
- }
- }
-
- // Apply cookies obtained from the original URL captured
- for i := range cookies {
- req.AddCookie(cookies[i])
- }
-
- resp, err = c.executeGET(item, req, false)
- if err != nil && err.Error() == "URL from redirection has already been seen" {
- return nil
- } else if err != nil {
- return err
- }
- defer resp.Body.Close()
-
- if extractor.IsM3U8(resp) {
- assets, err := extractor.M3U8(resp)
- if err == nil {
- assets = c.seencheckAssets(assets, item)
- if len(assets) != 0 {
- c.captureAssets(item, assets, cookies, headers)
- }
- } else {
- c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8")
- }
- }
-
- io.Copy(io.Discard, resp.Body)
-
- return nil
-}
-
-func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie, headers map[string]string) {
- // TODO: implement a counter for the number of assets
- // currently being processed
- // c.Frontier.QueueCount.Incr(int64(len(assets)))
- swg := sizedwaitgroup.New(int(c.MaxConcurrentAssets))
- excluded := false
-
- for _, asset := range assets {
- // TODO: implement a counter for the number of assets
- // currently being processed
- // c.Frontier.QueueCount.Incr(-1)
-
- // Just making sure we do not over archive by archiving the original URL
- if utils.URLToString(item.URL) == utils.URLToString(asset) {
- continue
- }
-
- // If the URL match any excluded string, we ignore it
- for _, excludedString := range c.ExcludedStrings {
- if strings.Contains(utils.URLToString(asset), excludedString) {
- excluded = true
- break
- }
- }
-
- if excluded {
- excluded = false
- continue
- }
-
- swg.Add()
- c.URIsPerSecond.Incr(1)
-
- go func(asset *url.URL, swg *sizedwaitgroup.SizedWaitGroup) {
- defer swg.Done()
-
- // Create the asset's item
- newAsset, err := queue.NewItem(asset, item.URL, "asset", item.Hop, "", false)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{
- "parentHop": item.Hop,
- "parentUrl": utils.URLToString(item.URL),
- "type": "asset",
- })).Error("error while creating asset item")
- return
- }
-
- // Capture the asset
- err = c.captureAsset(newAsset, cookies, headers)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{
- "parentHop": item.Hop,
- "parentUrl": utils.URLToString(item.URL),
- "type": "asset",
- })).Error("error while capturing asset")
- return
- }
-
- // If we made it to this point, it means that the asset have been crawled successfully,
- // then we can increment the locallyCrawled variable
- atomic.AddUint64(&item.LocallyCrawled, 1)
- }(asset, &swg)
- }
-
- swg.Wait()
-}
-
-func (c *Crawl) seencheckAssets(assets []*url.URL, item *queue.Item) []*url.URL {
- if c.UseSeencheck {
- if c.UseHQ {
- seencheckedURLs, err := c.HQSeencheckURLs(assets)
- // We ignore the error here because we don't want to slow down the crawl
- // if HQ is down or if the request failed. So if we get an error, we just
- // continue with the original list of assets.
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
- "urls": assets,
- "parentHop": item.Hop,
- "parentUrl": utils.URLToString(item.URL),
- })).Error("error while seenchecking assets via HQ")
- } else {
- assets = seencheckedURLs
- }
-
- if len(assets) == 0 {
- return []*url.URL{}
- }
- } else {
- seencheckedBatch := []*url.URL{}
-
- for _, URL := range assets {
- found := c.Seencheck.SeencheckURL(utils.URLToString(URL), "asset")
- if found {
- continue
- }
-
- seencheckedBatch = append(seencheckedBatch, URL)
- }
-
- if len(seencheckedBatch) == 0 {
- return []*url.URL{}
- }
-
- assets = seencheckedBatch
- }
- }
-
- return assets
-}
+// var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`)
+// var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`)
+
+// func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie, headers map[string]string) error {
+// var resp *http.Response
+
+// // Prepare GET request
+// req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil)
+// if err != nil {
+// return err
+// }
+
+// req.Header.Set("Referer", utils.URLToString(item.ParentURL))
+// req.Header.Set("User-Agent", c.UserAgent)
+
+// // If headers are passed, apply them to the request
+// if headers != nil {
+// for key, value := range headers {
+// req.Header.Set(key, value)
+// }
+// }
+
+// // Apply cookies obtained from the original URL captured
+// for i := range cookies {
+// req.AddCookie(cookies[i])
+// }
+
+// resp, err = c.executeGET(item, req, false)
+// if err != nil && err.Error() == "URL from redirection has already been seen" {
+// return nil
+// } else if err != nil {
+// return err
+// }
+// defer resp.Body.Close()
+
+// if extractor.IsM3U8(resp) {
+// assets, err := extractor.M3U8(resp)
+// if err == nil {
+// assets = c.seencheckAssets(assets, item)
+// if len(assets) != 0 {
+// c.captureAssets(item, assets, cookies, headers)
+// }
+// } else {
+// c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8")
+// }
+// }
+
+// io.Copy(io.Discard, resp.Body)
+
+// return nil
+// }
+
+// func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie, headers map[string]string) {
+// // TODO: implement a counter for the number of assets
+// // currently being processed
+// // c.Frontier.QueueCount.Incr(int64(len(assets)))
+// swg := sizedwaitgroup.New(int(c.MaxConcurrentAssets))
+// excluded := false
+
+// for _, asset := range assets {
+// // TODO: implement a counter for the number of assets
+// // currently being processed
+// // c.Frontier.QueueCount.Incr(-1)
+
+// // Just making sure we do not over archive by archiving the original URL
+// if utils.URLToString(item.URL) == utils.URLToString(asset) {
+// continue
+// }
+
+// // If the URL match any excluded string, we ignore it
+// for _, excludedString := range c.ExcludedStrings {
+// if strings.Contains(utils.URLToString(asset), excludedString) {
+// excluded = true
+// break
+// }
+// }
+
+// if excluded {
+// excluded = false
+// continue
+// }
+
+// swg.Add()
+// c.URIsPerSecond.Incr(1)
+
+// go func(asset *url.URL, swg *sizedwaitgroup.SizedWaitGroup) {
+// defer swg.Done()
+
+// // Create the asset's item
+// newAsset, err := queue.NewItem(asset, item.URL, "asset", item.Hop, "", false)
+// if err != nil {
+// c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{
+// "parentHop": item.Hop,
+// "parentUrl": utils.URLToString(item.URL),
+// "type": "asset",
+// })).Error("error while creating asset item")
+// return
+// }
+
+// // Capture the asset
+// err = c.captureAsset(newAsset, cookies, headers)
+// if err != nil {
+// c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{
+// "parentHop": item.Hop,
+// "parentUrl": utils.URLToString(item.URL),
+// "type": "asset",
+// })).Error("error while capturing asset")
+// return
+// }
+
+// // If we made it to this point, it means that the asset have been crawled successfully,
+// // then we can increment the locallyCrawled variable
+// atomic.AddUint64(&item.LocallyCrawled, 1)
+// }(asset, &swg)
+// }
+
+// swg.Wait()
+// }
+
+// func (c *Crawl) seencheckAssets(assets []*url.URL, item *queue.Item) []*url.URL {
+// if c.UseSeencheck {
+// if c.UseHQ {
+// seencheckedURLs, err := c.HQSeencheckURLs(assets)
+// // We ignore the error here because we don't want to slow down the crawl
+// // if HQ is down or if the request failed. So if we get an error, we just
+// // continue with the original list of assets.
+// if err != nil {
+// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
+// "urls": assets,
+// "parentHop": item.Hop,
+// "parentUrl": utils.URLToString(item.URL),
+// })).Error("error while seenchecking assets via HQ")
+// } else {
+// assets = seencheckedURLs
+// }
+
+// if len(assets) == 0 {
+// return []*url.URL{}
+// }
+// } else {
+// seencheckedBatch := []*url.URL{}
+
+// for _, URL := range assets {
+// found := c.Seencheck.SeencheckURL(utils.URLToString(URL), "asset")
+// if found {
+// continue
+// }
+
+// seencheckedBatch = append(seencheckedBatch, URL)
+// }
+
+// if len(seencheckedBatch) == 0 {
+// return []*url.URL{}
+// }
+
+// assets = seencheckedBatch
+// }
+// }
+
+// return assets
+// }
func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) {
var rawAssets []string
diff --git a/internal/pkg/crawl/capture.go b/.old/internal/pkg/crawl/capture.go
similarity index 95%
rename from internal/pkg/crawl/capture.go
rename to .old/internal/pkg/crawl/capture.go
index 0e7308c8..6171cb1d 100644
--- a/internal/pkg/crawl/capture.go
+++ b/.old/internal/pkg/crawl/capture.go
@@ -35,24 +35,24 @@ func (c *Crawl) executeGET(item *queue.Item, req *http.Request, isRedirection bo
URL *url.URL
)
- defer func() {
- if c.PrometheusMetrics != nil {
- c.PrometheusMetrics.DownloadedURI.Inc()
- }
+ // defer func() {
+ // if c.PrometheusMetrics != nil {
+ // c.PrometheusMetrics.DownloadedURI.Inc()
+ // }
- c.URIsPerSecond.Incr(1)
+ // c.URIsPerSecond.Incr(1)
- if item.Type == "seed" {
- c.CrawledSeeds.Incr(1)
- } else if item.Type == "asset" {
- c.CrawledAssets.Incr(1)
- }
- }()
+ // if item.Type == "seed" {
+ // c.CrawledSeeds.Incr(1)
+ // } else if item.Type == "asset" {
+ // c.CrawledAssets.Incr(1)
+ // }
+ // }()
- // Check if the crawl is paused
- for c.Paused.Get() {
- time.Sleep(time.Second)
- }
+ // // Check if the crawl is paused
+ // for c.Paused.Get() {
+ // time.Sleep(time.Second)
+ // }
// Retry on 429 error
for retry := uint8(0); retry < c.MaxRetry; retry++ {
@@ -414,26 +414,6 @@ func (c *Crawl) Capture(item *queue.Item) error {
}
return nil
- } else if ina.IsAPIURL(req) {
- rawAssets, err := ina.ExtractMedias(resp)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract medias from INA")
- }
-
- if len(rawAssets) != 0 {
- assets = c.seencheckAssets(rawAssets, item)
-
- if len(assets) != 0 {
- for _, asset := range rawAssets {
- playerItem, err := queue.NewItem(asset, item.URL, "seed", 0, "", false)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to create new item from asset")
- } else {
- c.Capture(playerItem)
- }
- }
- }
- }
}
// Scrape potential URLs from Link HTTP header
diff --git a/internal/pkg/crawl/config.go b/.old/internal/pkg/crawl/config.go
similarity index 98%
rename from internal/pkg/crawl/config.go
rename to .old/internal/pkg/crawl/config.go
index a2bae436..06f80fc6 100644
--- a/internal/pkg/crawl/config.go
+++ b/.old/internal/pkg/crawl/config.go
@@ -116,9 +116,7 @@ type Crawl struct {
HQStrategy string
HQBatchConcurrency int
HQBatchSize int
- HQContinuousPull bool
HQClient *gocrawlhq.Client
- HQConsumerState string
HQFinishedChannel chan *queue.Item
HQProducerChannel chan *queue.Item
HQChannelsWg *sync.WaitGroup
@@ -321,7 +319,6 @@ func GenerateCrawlConfig(config *config.Config) (*Crawl, error) {
c.HQStrategy = config.HQStrategy
c.HQBatchSize = int(config.HQBatchSize)
c.HQBatchConcurrency = config.HQBatchConcurrency
- c.HQContinuousPull = config.HQContinuousPull
c.HQRateLimitingSendBack = config.HQRateLimitSendBack
// Handover mechanism
diff --git a/internal/pkg/crawl/crawl.go b/.old/internal/pkg/crawl/crawl.go
similarity index 98%
rename from internal/pkg/crawl/crawl.go
rename to .old/internal/pkg/crawl/crawl.go
index 53fcff59..980e6665 100644
--- a/internal/pkg/crawl/crawl.go
+++ b/.old/internal/pkg/crawl/crawl.go
@@ -16,7 +16,7 @@ import (
"github.com/internetarchive/Zeno/internal/pkg/utils"
"github.com/internetarchive/gocrawlhq"
"github.com/prometheus/client_golang/prometheus"
- "github.com/telanflow/cookiejar"
+ "github.com/ssgelm/cookiejarparser"
"mvdan.cc/xurls/v2"
)
@@ -173,7 +173,7 @@ func (c *Crawl) Start() (err error) {
// Parse input cookie file if specified
if c.CookieFile != "" {
- cookieJar, err := cookiejar.NewFileJar(c.CookieFile, nil)
+ cookieJar, err := cookiejarparser.LoadCookieJarFile("cookies.txt")
if err != nil {
c.Log.WithFields(c.genLogFields(err, nil, nil)).Fatal("unable to parse cookie file")
}
diff --git a/internal/pkg/crawl/exclusion.go b/.old/internal/pkg/crawl/exclusion.go
similarity index 100%
rename from internal/pkg/crawl/exclusion.go
rename to .old/internal/pkg/crawl/exclusion.go
diff --git a/internal/pkg/crawl/extractor/json.go b/.old/internal/pkg/crawl/extractor/json.go
similarity index 100%
rename from internal/pkg/crawl/extractor/json.go
rename to .old/internal/pkg/crawl/extractor/json.go
diff --git a/internal/pkg/crawl/extractor/json_test.go b/.old/internal/pkg/crawl/extractor/json_test.go
similarity index 100%
rename from internal/pkg/crawl/extractor/json_test.go
rename to .old/internal/pkg/crawl/extractor/json_test.go
diff --git a/internal/pkg/crawl/extractor/m3u8.go b/.old/internal/pkg/crawl/extractor/m3u8.go
similarity index 100%
rename from internal/pkg/crawl/extractor/m3u8.go
rename to .old/internal/pkg/crawl/extractor/m3u8.go
diff --git a/internal/pkg/crawl/extractor/s3.go b/.old/internal/pkg/crawl/extractor/s3.go
similarity index 100%
rename from internal/pkg/crawl/extractor/s3.go
rename to .old/internal/pkg/crawl/extractor/s3.go
diff --git a/internal/pkg/crawl/extractor/utils.go b/.old/internal/pkg/crawl/extractor/utils.go
similarity index 100%
rename from internal/pkg/crawl/extractor/utils.go
rename to .old/internal/pkg/crawl/extractor/utils.go
diff --git a/internal/pkg/crawl/extractor/xml.go b/.old/internal/pkg/crawl/extractor/xml.go
similarity index 100%
rename from internal/pkg/crawl/extractor/xml.go
rename to .old/internal/pkg/crawl/extractor/xml.go
diff --git a/internal/pkg/crawl/extractor/xml_test.go b/.old/internal/pkg/crawl/extractor/xml_test.go
similarity index 100%
rename from internal/pkg/crawl/extractor/xml_test.go
rename to .old/internal/pkg/crawl/extractor/xml_test.go
diff --git a/internal/pkg/crawl/extractor/xml_test_sitemap.xml b/.old/internal/pkg/crawl/extractor/xml_test_sitemap.xml
similarity index 100%
rename from internal/pkg/crawl/extractor/xml_test_sitemap.xml
rename to .old/internal/pkg/crawl/extractor/xml_test_sitemap.xml
diff --git a/internal/pkg/crawl/finish.go b/.old/internal/pkg/crawl/finish.go
similarity index 100%
rename from internal/pkg/crawl/finish.go
rename to .old/internal/pkg/crawl/finish.go
diff --git a/internal/pkg/crawl/http_utils.go b/.old/internal/pkg/crawl/http_utils.go
similarity index 100%
rename from internal/pkg/crawl/http_utils.go
rename to .old/internal/pkg/crawl/http_utils.go
diff --git a/internal/pkg/crawl/log.go b/.old/internal/pkg/crawl/log.go
similarity index 100%
rename from internal/pkg/crawl/log.go
rename to .old/internal/pkg/crawl/log.go
diff --git a/internal/pkg/crawl/outlinks.go b/.old/internal/pkg/crawl/outlinks.go
similarity index 100%
rename from internal/pkg/crawl/outlinks.go
rename to .old/internal/pkg/crawl/outlinks.go
diff --git a/internal/pkg/crawl/stats.go b/.old/internal/pkg/crawl/stats.go
similarity index 100%
rename from internal/pkg/crawl/stats.go
rename to .old/internal/pkg/crawl/stats.go
diff --git a/internal/pkg/crawl/utils.go b/.old/internal/pkg/crawl/utils.go
similarity index 92%
rename from internal/pkg/crawl/utils.go
rename to .old/internal/pkg/crawl/utils.go
index d59434cd..8a7d718b 100644
--- a/internal/pkg/crawl/utils.go
+++ b/.old/internal/pkg/crawl/utils.go
@@ -82,12 +82,3 @@ func extractLinksFromText(source string) (links []*url.URL) {
// func (c *Crawl) shouldPause(host string) bool {
// return c.Frontier.GetActiveHostCount(host) >= c.MaxConcurrentRequestsPerDomain
// }
-
-func isStatusCodeRedirect(statusCode int) bool {
- if statusCode == 300 || statusCode == 301 ||
- statusCode == 302 || statusCode == 307 ||
- statusCode == 308 {
- return true
- }
- return false
-}
diff --git a/internal/pkg/crawl/warc.go b/.old/internal/pkg/crawl/warc.go
similarity index 100%
rename from internal/pkg/crawl/warc.go
rename to .old/internal/pkg/crawl/warc.go
diff --git a/internal/pkg/crawl/worker.go b/.old/internal/pkg/crawl/worker.go
similarity index 100%
rename from internal/pkg/crawl/worker.go
rename to .old/internal/pkg/crawl/worker.go
diff --git a/internal/pkg/crawl/worker_pool.go b/.old/internal/pkg/crawl/worker_pool.go
similarity index 100%
rename from internal/pkg/crawl/worker_pool.go
rename to .old/internal/pkg/crawl/worker_pool.go
diff --git a/internal/pkg/log/elasticsearch.go b/.old/internal/pkg/log/elasticsearch.go
similarity index 100%
rename from internal/pkg/log/elasticsearch.go
rename to .old/internal/pkg/log/elasticsearch.go
diff --git a/internal/pkg/log/file.go b/.old/internal/pkg/log/file.go
similarity index 100%
rename from internal/pkg/log/file.go
rename to .old/internal/pkg/log/file.go
diff --git a/.old/internal/pkg/log/log.go b/.old/internal/pkg/log/log.go
new file mode 100644
index 00000000..e5b56b61
--- /dev/null
+++ b/.old/internal/pkg/log/log.go
@@ -0,0 +1,262 @@
+// Package log provides a custom logging solution with multi-output support
+// and log rotation for file output.
+// -----------------------------------------------------------------------------
+// When Logger.{Debug, Info, Warn, Error, Fatal} is called, the log message is
+// passed to all underlying handlers represented by Logger.handler
+// Then multiHandler.Handle is called to pass the log message to all underlying handlers.
+// -----------------------------------------------------------------------------
+// The rotation mechanism works by locking the logger, checking if it's time to rotate,
+// and then calling the Rotate method on all rotatable handlers.
+package log
+
+import (
+ "context"
+ "fmt"
+ "log/slog"
+ "os"
+ "path/filepath"
+ "sync"
+ "sync/atomic"
+ "time"
+
+ "github.com/elastic/go-elasticsearch/v8"
+)
+
+var (
+ isLoggerInit *atomic.Bool
+ storedLogger *Logger
+ once sync.Once
+)
+
+// Logger wraps slog.Logger to provide multi-output functionality
+type Logger struct {
+ sync.Mutex
+ handler *multiHandler
+ slogger *slog.Logger
+ stopRotation chan struct{}
+ stopErrorLog chan struct{}
+ errorChan chan error
+}
+
+// Config holds the configuration for the logger
+type Config struct {
+ FileConfig *LogfileConfig
+ FileLevel slog.Level
+ StdoutEnabled bool
+ StdoutLevel slog.Level
+ RotateLogFile bool
+ ElasticsearchConfig *ElasticsearchConfig
+ RotateElasticSearchIndex bool
+ isDefault bool
+}
+
+// New creates a new Logger instance with the given configuration.
+// It sets up handlers for stdout (text format) and file output (JSON format) if specified.
+// If FileOutput is empty, only stdout logging will be enabled.
+// Only the first call to New will store the logger to be reused. Subsequent calls will return a new logger instance.
+// Only the first call to New will rotate the logs destinations.
+// Please refrain from calling New multiple times in the same program.
+//
+// Parameters:
+// - cfg: Config struct containing logger configuration options
+//
+// Returns:
+// - *Logger: A new Logger instance
+// - error: An error if there was a problem creating the logger (e.g., unable to open log file)
+func New(cfg Config) (*Logger, error) {
+ var handlers []slog.Handler
+
+ // Create stdout handler
+ if cfg.StdoutEnabled {
+ stdoutHandler := slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
+ Level: cfg.StdoutLevel,
+ })
+ handlers = append(handlers, stdoutHandler)
+ }
+
+ // Create file handler if FileOutput is specified
+ if cfg.FileConfig != nil {
+ // Create directories if they don't exist
+ err := os.MkdirAll(filepath.Dir(cfg.FileConfig.Filename()), 0755)
+ if err != nil {
+ return nil, err
+ }
+
+ // Open log file
+ file, err := os.OpenFile(cfg.FileConfig.Filename(), os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666)
+ if err != nil {
+ return nil, err
+ }
+ fileHandler := &fileHandler{
+ Handler: slog.NewJSONHandler(file, &slog.HandlerOptions{Level: cfg.FileLevel}),
+ fileDescriptor: file,
+ rotationInterval: 6 * time.Hour,
+ lastRotation: time.Now(),
+ logfileConfig: cfg.FileConfig,
+ }
+ handlers = append(handlers, fileHandler)
+ }
+
+ // Create Elasticsearch handler if ElasticsearchConfig is specified
+ if cfg.ElasticsearchConfig != nil {
+ esClient, err := elasticsearch.NewClient(elasticsearch.Config{
+ Addresses: cfg.ElasticsearchConfig.Addresses,
+ Username: cfg.ElasticsearchConfig.Username,
+ Password: cfg.ElasticsearchConfig.Password,
+ })
+ if err != nil {
+ return nil, fmt.Errorf("failed to create Elasticsearch client: %w", err)
+ }
+ esHandler := &ElasticsearchHandler{
+ client: esClient,
+ index: fmt.Sprintf("%s-%s", cfg.ElasticsearchConfig.IndexPrefix, time.Now().Format("2006.01.02")),
+ level: cfg.ElasticsearchConfig.Level,
+ attrs: []slog.Attr{},
+ groups: []string{},
+ config: cfg.ElasticsearchConfig,
+ }
+ if err := esHandler.createIndex(); err != nil {
+ return nil, fmt.Errorf("failed to create Elasticsearch index: %w", err)
+ }
+ handlers = append(handlers, esHandler)
+ }
+
+ // Create multi-handler
+ mh := &multiHandler{handlers: handlers}
+
+ // Create slog.Logger
+ slogger := slog.New(mh)
+
+ logger := &Logger{
+ handler: mh,
+ slogger: slogger,
+ errorChan: make(chan error, 10),
+ stopErrorLog: make(chan struct{}),
+ }
+
+ if !cfg.isDefault {
+ once.Do(func() {
+ isLoggerInit = new(atomic.Bool)
+ storedLogger = logger
+ isLoggerInit.CompareAndSwap(false, true)
+
+ // Start rotation goroutine
+ logger.startRotation()
+ })
+ }
+
+ return logger, nil
+}
+
+// DefaultOrStored returns the default Logger instance or if already initialized, the logger created by first call to New().
+// The default logger writes to both stdout (text format) and a file named "app.log" (JSON format).
+// Both outputs are set to log messages at Info level and above.
+// This function uses sync.Once to ensure that the default logger is only created once.
+//
+// Returns:
+// - *Logger: The default Logger instance
+// - bool: True if the logger was created by this function, false if the logger was already initialized
+func DefaultOrStored() (*Logger, bool) {
+ var created = false
+ once.Do(func() {
+ isLoggerInit = new(atomic.Bool)
+ logger, err := New(Config{
+ FileConfig: &LogfileConfig{Dir: "jobs", Prefix: "zeno"},
+ FileLevel: slog.LevelInfo,
+ StdoutLevel: slog.LevelInfo,
+ isDefault: true,
+ })
+ if err != nil {
+ panic(err)
+ }
+ storedLogger = logger
+ created = isLoggerInit.CompareAndSwap(false, true)
+ })
+ return storedLogger, created
+}
+
+// GetStoredLogger returns the logger created by the first call to New() or DefaultOrStored().
+// If the logger has not been initialized, it will return nil.
+func GetStoredLogger() *Logger {
+ return storedLogger
+}
+
+// Errors returns a channel that will receive logging errors
+func (l *Logger) Errors() <-chan error {
+ return l.errorChan
+}
+
+func (l *Logger) log(level slog.Level, msg string, args ...any) {
+ l.Lock()
+ defer l.Unlock()
+
+ // Create a new Record with the message and args
+ r := slog.NewRecord(time.Now(), level, msg, 0)
+ r.Add(args...)
+
+ err := l.handler.Handle(context.Background(), r)
+ if err != nil {
+ select {
+ case l.errorChan <- err:
+ default:
+ // If the error channel is full, log to stderr as a last resort
+ fmt.Fprintf(os.Stderr, "Logging error: %v\n", err)
+ }
+ }
+}
+
+// Debug logs a message at Debug level.
+// The first argument is the message to log, and subsequent arguments are key-value pairs
+// that will be included in the log entry.
+//
+// Parameters:
+// - msg: The message to log
+// - args: Optional key-value pairs to include in the log entry
+func (l *Logger) Debug(msg string, args ...any) {
+ l.log(slog.LevelDebug, msg, args...)
+}
+
+// Info logs a message at Info level.
+// The first argument is the message to log, and subsequent arguments are key-value pairs
+// that will be included in the log entry.
+//
+// Parameters:
+// - msg: The message to log
+// - args: Optional key-value pairs to include in the log entry
+func (l *Logger) Info(msg string, args ...any) {
+ l.log(slog.LevelInfo, msg, args...)
+}
+
+// Warn logs a message at Warn level.
+// The first argument is the message to log, and subsequent arguments are key-value pairs
+// that will be included in the log entry.
+//
+// Parameters:
+// - msg: The message to log
+// - args: Optional key-value pairs to include in the log entry
+func (l *Logger) Warn(msg string, args ...any) {
+ l.log(slog.LevelWarn, msg, args...)
+}
+
+// Error logs a message at Error level.
+// The first argument is the message to log, and subsequent arguments are key-value pairs
+// that will be included in the log entry.
+//
+// Parameters:
+// - msg: The message to log
+// - args: Optional key-value pairs to include in the log entry
+func (l *Logger) Error(msg string, args ...any) {
+ l.log(slog.LevelError, msg, args...)
+}
+
+// Fatal logs a message at Error level and then calls os.Exit(1).
+// The first argument is the message to log, and subsequent arguments are key-value pairs
+// that will be included in the log entry.
+//
+// Parameters:
+// - msg: The message to log
+// - args: Optional key-value pairs to include in the log entry
+func (l *Logger) Fatal(msg string, args ...any) {
+ l.log(slog.LevelError, msg, args...)
+ os.Exit(1)
+}
diff --git a/internal/pkg/log/misc.go b/.old/internal/pkg/log/misc.go
similarity index 100%
rename from internal/pkg/log/misc.go
rename to .old/internal/pkg/log/misc.go
diff --git a/internal/pkg/log/multi_handler.go b/.old/internal/pkg/log/multi_handler.go
similarity index 100%
rename from internal/pkg/log/multi_handler.go
rename to .old/internal/pkg/log/multi_handler.go
diff --git a/internal/pkg/log/rotate.go b/.old/internal/pkg/log/rotate.go
similarity index 100%
rename from internal/pkg/log/rotate.go
rename to .old/internal/pkg/log/rotate.go
diff --git a/internal/pkg/log/withfields.go b/.old/internal/pkg/log/withfields.go
similarity index 100%
rename from internal/pkg/log/withfields.go
rename to .old/internal/pkg/log/withfields.go
diff --git a/internal/pkg/log/writer.go b/.old/internal/pkg/log/writer.go
similarity index 100%
rename from internal/pkg/log/writer.go
rename to .old/internal/pkg/log/writer.go
diff --git a/internal/pkg/queue/access.go b/.old/internal/pkg/queue/access.go
similarity index 100%
rename from internal/pkg/queue/access.go
rename to .old/internal/pkg/queue/access.go
diff --git a/internal/pkg/queue/access_test.go b/.old/internal/pkg/queue/access_test.go
similarity index 100%
rename from internal/pkg/queue/access_test.go
rename to .old/internal/pkg/queue/access_test.go
diff --git a/internal/pkg/queue/dequeue.go b/.old/internal/pkg/queue/dequeue.go
similarity index 100%
rename from internal/pkg/queue/dequeue.go
rename to .old/internal/pkg/queue/dequeue.go
diff --git a/internal/pkg/queue/dequeue_test.go b/.old/internal/pkg/queue/dequeue_test.go
similarity index 100%
rename from internal/pkg/queue/dequeue_test.go
rename to .old/internal/pkg/queue/dequeue_test.go
diff --git a/internal/pkg/queue/encoding.go b/.old/internal/pkg/queue/encoding.go
similarity index 100%
rename from internal/pkg/queue/encoding.go
rename to .old/internal/pkg/queue/encoding.go
diff --git a/internal/pkg/queue/encoding_test.go b/.old/internal/pkg/queue/encoding_test.go
similarity index 100%
rename from internal/pkg/queue/encoding_test.go
rename to .old/internal/pkg/queue/encoding_test.go
diff --git a/internal/pkg/queue/enqueue.go b/.old/internal/pkg/queue/enqueue.go
similarity index 100%
rename from internal/pkg/queue/enqueue.go
rename to .old/internal/pkg/queue/enqueue.go
diff --git a/internal/pkg/queue/enqueue_test.go b/.old/internal/pkg/queue/enqueue_test.go
similarity index 100%
rename from internal/pkg/queue/enqueue_test.go
rename to .old/internal/pkg/queue/enqueue_test.go
diff --git a/internal/pkg/queue/error.go b/.old/internal/pkg/queue/error.go
similarity index 100%
rename from internal/pkg/queue/error.go
rename to .old/internal/pkg/queue/error.go
diff --git a/internal/pkg/queue/handover.go b/.old/internal/pkg/queue/handover.go
similarity index 100%
rename from internal/pkg/queue/handover.go
rename to .old/internal/pkg/queue/handover.go
diff --git a/internal/pkg/queue/handover_test.go b/.old/internal/pkg/queue/handover_test.go
similarity index 100%
rename from internal/pkg/queue/handover_test.go
rename to .old/internal/pkg/queue/handover_test.go
diff --git a/internal/pkg/queue/index/encoding.go b/.old/internal/pkg/queue/index/encoding.go
similarity index 100%
rename from internal/pkg/queue/index/encoding.go
rename to .old/internal/pkg/queue/index/encoding.go
diff --git a/internal/pkg/queue/index/error.go b/.old/internal/pkg/queue/index/error.go
similarity index 100%
rename from internal/pkg/queue/index/error.go
rename to .old/internal/pkg/queue/index/error.go
diff --git a/internal/pkg/queue/index/file_io.go b/.old/internal/pkg/queue/index/file_io.go
similarity index 100%
rename from internal/pkg/queue/index/file_io.go
rename to .old/internal/pkg/queue/index/file_io.go
diff --git a/internal/pkg/queue/index/index.go b/.old/internal/pkg/queue/index/index.go
similarity index 100%
rename from internal/pkg/queue/index/index.go
rename to .old/internal/pkg/queue/index/index.go
diff --git a/internal/pkg/queue/index/manager.go b/.old/internal/pkg/queue/index/manager.go
similarity index 100%
rename from internal/pkg/queue/index/manager.go
rename to .old/internal/pkg/queue/index/manager.go
diff --git a/internal/pkg/queue/index/manager_test.go b/.old/internal/pkg/queue/index/manager_test.go
similarity index 100%
rename from internal/pkg/queue/index/manager_test.go
rename to .old/internal/pkg/queue/index/manager_test.go
diff --git a/internal/pkg/queue/index/recovery.go b/.old/internal/pkg/queue/index/recovery.go
similarity index 100%
rename from internal/pkg/queue/index/recovery.go
rename to .old/internal/pkg/queue/index/recovery.go
diff --git a/internal/pkg/queue/index/recovery_test.go b/.old/internal/pkg/queue/index/recovery_test.go
similarity index 100%
rename from internal/pkg/queue/index/recovery_test.go
rename to .old/internal/pkg/queue/index/recovery_test.go
diff --git a/internal/pkg/queue/index/wal.go b/.old/internal/pkg/queue/index/wal.go
similarity index 100%
rename from internal/pkg/queue/index/wal.go
rename to .old/internal/pkg/queue/index/wal.go
diff --git a/internal/pkg/queue/index/wal_test.go b/.old/internal/pkg/queue/index/wal_test.go
similarity index 100%
rename from internal/pkg/queue/index/wal_test.go
rename to .old/internal/pkg/queue/index/wal_test.go
diff --git a/internal/pkg/queue/item.go b/.old/internal/pkg/queue/item.go
similarity index 100%
rename from internal/pkg/queue/item.go
rename to .old/internal/pkg/queue/item.go
diff --git a/internal/pkg/queue/item_test.go b/.old/internal/pkg/queue/item_test.go
similarity index 100%
rename from internal/pkg/queue/item_test.go
rename to .old/internal/pkg/queue/item_test.go
diff --git a/internal/pkg/queue/metadata.go b/.old/internal/pkg/queue/metadata.go
similarity index 100%
rename from internal/pkg/queue/metadata.go
rename to .old/internal/pkg/queue/metadata.go
diff --git a/internal/pkg/queue/protobuf/v1/item.pb.go b/.old/internal/pkg/queue/protobuf/v1/item.pb.go
similarity index 100%
rename from internal/pkg/queue/protobuf/v1/item.pb.go
rename to .old/internal/pkg/queue/protobuf/v1/item.pb.go
diff --git a/internal/pkg/queue/protobuf/v1/item.proto b/.old/internal/pkg/queue/protobuf/v1/item.proto
similarity index 100%
rename from internal/pkg/queue/protobuf/v1/item.proto
rename to .old/internal/pkg/queue/protobuf/v1/item.proto
diff --git a/internal/pkg/queue/queue.go b/.old/internal/pkg/queue/queue.go
similarity index 100%
rename from internal/pkg/queue/queue.go
rename to .old/internal/pkg/queue/queue.go
diff --git a/internal/pkg/queue/queue_test.go b/.old/internal/pkg/queue/queue_test.go
similarity index 100%
rename from internal/pkg/queue/queue_test.go
rename to .old/internal/pkg/queue/queue_test.go
diff --git a/internal/pkg/queue/stats.go b/.old/internal/pkg/queue/stats.go
similarity index 100%
rename from internal/pkg/queue/stats.go
rename to .old/internal/pkg/queue/stats.go
diff --git a/cmd/cmd.go b/cmd/cmd.go
index 62cd5526..2d606cf4 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -3,7 +3,7 @@ package cmd
import (
"fmt"
- "github.com/internetarchive/Zeno/config"
+ "github.com/internetarchive/Zeno/internal/pkg/config"
"github.com/spf13/cobra"
)
@@ -27,7 +27,7 @@ Authors:
return fmt.Errorf("error initializing config: %s", err)
}
- cfg = config.GetConfig()
+ cfg = config.Get()
return nil
},
Run: func(cmd *cobra.Command, args []string) {
@@ -43,6 +43,7 @@ func Run() error {
rootCmd.PersistentFlags().String("log-level", "info", "stdout log level (debug, info, warn, error)")
rootCmd.PersistentFlags().String("config-file", "", "config file (default is $HOME/zeno-config.yaml)")
rootCmd.PersistentFlags().Bool("no-stdout-log", false, "disable stdout logging.")
+ rootCmd.PersistentFlags().Bool("no-stderr-log", false, "disable stderr logging.")
rootCmd.PersistentFlags().Bool("consul-config", false, "Use this flag to enable consul config support")
rootCmd.PersistentFlags().String("consul-address", "", "The consul address used to retreive config")
rootCmd.PersistentFlags().String("consul-path", "", "The full Consul K/V path where the config is stored")
diff --git a/cmd/get.go b/cmd/get.go
index fc273e2c..33893484 100644
--- a/cmd/get.go
+++ b/cmd/get.go
@@ -20,41 +20,39 @@ func getCMDs() *cobra.Command {
getCmd.AddCommand(getURLCmd)
getCmd.AddCommand(getHQCmd)
- getCmd.AddCommand(getListCmd)
return getCmd
}
func getCMDsFlags(getCmd *cobra.Command) {
- getCmd.PersistentFlags().String("user-agent", "Zeno", "User agent to use when requesting URLs.")
+ getCmd.PersistentFlags().String("user-agent", "", "User agent to use when requesting URLs.")
getCmd.PersistentFlags().String("job", "", "Job name to use, will determine the path for the persistent queue, seencheck database, and WARC files.")
getCmd.PersistentFlags().IntP("workers", "w", 1, "Number of concurrent workers to run.")
- getCmd.PersistentFlags().Int("max-concurrent-assets", 8, "Max number of concurrent assets to fetch PER worker. E.g. if you have 100 workers and this setting at 8, Zeno could do up to 800 concurrent requests at any time.")
- getCmd.PersistentFlags().Uint8("max-hops", 0, "Maximum number of hops to execute.")
+ getCmd.PersistentFlags().Int("max-concurrent-assets", 1, "Max number of concurrent assets to fetch PER worker. E.g. if you have 100 workers and this setting at 8, Zeno could do up to 800 concurrent requests at any time.")
+ getCmd.PersistentFlags().Int("max-hops", 0, "Maximum number of hops to execute.")
getCmd.PersistentFlags().String("cookies", "", "File containing cookies that will be used for requests.")
getCmd.PersistentFlags().Bool("keep-cookies", false, "Keep a global cookie jar")
getCmd.PersistentFlags().Bool("headless", false, "Use headless browsers instead of standard GET requests.")
getCmd.PersistentFlags().Bool("disable-seencheck", false, "Disable the (remote or local) seencheck that avoid re-crawling of URIs.")
getCmd.PersistentFlags().Bool("json", false, "Output logs in JSON")
- getCmd.PersistentFlags().Bool("debug", false, "")
getCmd.PersistentFlags().Bool("api", false, "Enable API")
getCmd.PersistentFlags().String("api-port", "9443", "Port to listen on for the API.")
getCmd.PersistentFlags().Bool("prometheus", false, "Export metrics in Prometheus format. (implies --api)")
getCmd.PersistentFlags().String("prometheus-prefix", "zeno:", "String used as a prefix for the exported Prometheus metrics.")
getCmd.PersistentFlags().Int("max-redirect", 20, "Specifies the maximum number of redirections to follow for a resource.")
getCmd.PersistentFlags().Int("max-retry", 5, "Number of retry if error happen when executing HTTP request.")
- getCmd.PersistentFlags().Int("http-timeout", -1, "Number of seconds to wait before timing out a request.")
- getCmd.PersistentFlags().Bool("domains-crawl", false, "If this is turned on, seeds will be treated as domains to crawl, therefore same-domain outlinks will be added to the queue as hop=0.")
+ getCmd.PersistentFlags().Int("http-timeout", -1, "Number of seconds to wait before timing out a request. Note: this will CANCEL large files download.")
+ getCmd.PersistentFlags().Int("http-read-deadline", 60, "Number of seconds to wait before timing out a (blocking) read.")
+ getCmd.PersistentFlags().StringSlice("domains-crawl", []string{}, "Naive domains, full URLs or regexp to match against any URL to determine hop behaviour for outlinks. If an outlink URL is matched it will be queued to crawl with a hop of 0. This flag helps crawling entire domains while doing non-focused crawls.")
getCmd.PersistentFlags().StringSlice("disable-html-tag", []string{}, "Specify HTML tag to not extract assets from")
getCmd.PersistentFlags().Bool("capture-alternate-pages", false, "If turned on, HTML tags with \"alternate\" values for their \"rel\" attribute will be archived.")
getCmd.PersistentFlags().StringSlice("exclude-host", []string{}, "Exclude a specific host from the crawl, note that it will not exclude the domain if it is encountered as an asset for another web page.")
getCmd.PersistentFlags().StringSlice("include-host", []string{}, "Only crawl specific hosts, note that it will not include the domain if it is encountered as an asset for another web page.")
getCmd.PersistentFlags().StringSlice("include-string", []string{}, "Only crawl URLs containing this string.")
- getCmd.PersistentFlags().Int("max-concurrent-per-domain", 16, "Maximum number of concurrent requests per domain.")
- getCmd.PersistentFlags().Int("concurrent-sleep-length", 500, "Number of milliseconds to sleep when max concurrency per domain is reached.")
getCmd.PersistentFlags().Int("crawl-time-limit", 0, "Number of seconds until the crawl will automatically set itself into the finished state.")
getCmd.PersistentFlags().Int("crawl-max-time-limit", 0, "Number of seconds until the crawl will automatically panic itself. Default to crawl-time-limit + (crawl-time-limit / 10)")
getCmd.PersistentFlags().StringSlice("exclude-string", []string{}, "Discard any (discovered) URLs containing this string.")
+ getCmd.PersistentFlags().StringSlice("exclusion-file", []string{}, "File containing regex to apply on URLs for exclusion. If the path start with http or https, it will be treated as a URL of a file to download.")
getCmd.PersistentFlags().Int("min-space-required", 20, "Minimum space required in GB to continue the crawl.")
getCmd.PersistentFlags().Bool("handover", false, "Use the handover mechanism that dispatch URLs via a buffer before enqueuing on disk. (UNSTABLE)")
getCmd.PersistentFlags().Bool("ultrasafe-queue", false, "Don't use committed batch writes to the WAL and instead fsync() after each write.")
@@ -74,32 +72,43 @@ func getCMDsFlags(getCmd *cobra.Command) {
getCmd.PersistentFlags().Bool("warc-on-disk", false, "Do not use RAM to store payloads when recording traffic to WARCs, everything will happen on disk (usually used to reduce memory usage).")
getCmd.PersistentFlags().Int("warc-pool-size", 1, "Number of concurrent WARC files to write.")
getCmd.PersistentFlags().String("warc-temp-dir", "", "Custom directory to use for WARC temporary files.")
- getCmd.PersistentFlags().Bool("disable-local-dedupe", false, "Disable local URL agonistic deduplication.")
+ getCmd.PersistentFlags().Bool("disable-local-dedupe", false, "Disable local URL agnostic deduplication.")
getCmd.PersistentFlags().Bool("cert-validation", false, "Enables certificate validation on HTTPS requests.")
getCmd.PersistentFlags().Bool("disable-assets-capture", false, "Disable assets capture.")
getCmd.PersistentFlags().Int("warc-dedupe-size", 1024, "Minimum size to deduplicate WARC records with revisit records.")
- getCmd.PersistentFlags().String("cdx-cookie", "", "Pass custom cookie during CDX requests. Example: 'cdx_auth_token=test_value'")
+ getCmd.PersistentFlags().String("warc-cdx-cookie", "", "Pass custom cookie during CDX requests. Example: 'cdx_auth_token=test_value'")
getCmd.PersistentFlags().Int("warc-size", 1024, "Size of the WARC files in MB.")
// Logging flags
- getCmd.PersistentFlags().Bool("live-stats", false, "Enable live stats but disable logging. (implies --no-stdout-log)")
+ getCmd.PersistentFlags().Bool("tui", false, "Display a terminal user interface.")
+ getCmd.PersistentFlags().String("tui-log-level", "info", "Log level for the TUI.")
+ getCmd.PersistentFlags().Bool("no-log-file", false, "Disable log file output.")
getCmd.PersistentFlags().String("log-file-output-dir", "", "Directory to write log files to.")
- getCmd.PersistentFlags().String("es-url", "", "comma-separated ElasticSearch URL to use for indexing crawl logs.")
- getCmd.PersistentFlags().String("es-user", "", "ElasticSearch username to use for indexing crawl logs.")
- getCmd.PersistentFlags().String("es-password", "", "ElasticSearch password to use for indexing crawl logs.")
- getCmd.PersistentFlags().String("es-index-prefix", "zeno", "ElasticSearch index prefix to use for indexing crawl logs. Default is : `zeno`, without `-`")
+ getCmd.PersistentFlags().String("log-file-prefix", "ZENO", "Prefix to use when naming the log files. Default is : `ZENO`, without '-'")
+ getCmd.PersistentFlags().String("log-file-level", "info", "Log level for the log file.")
+ getCmd.PersistentFlags().String("log-file-rotation", "1h", "Log file rotation period. Default is : `1h`. Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'.")
+ getCmd.PersistentFlags().String("log-es-urls", "", "(Not implemented) Comma-separated ElasticSearch URL to use for indexing crawl logs.")
+ getCmd.PersistentFlags().String("log-es-user", "", "(Not implemented) ElasticSearch username to use for indexing crawl logs.")
+ getCmd.PersistentFlags().String("log-es-password", "", "(Not implemented) ElasticSearch password to use for indexing crawl logs.")
+ getCmd.PersistentFlags().String("log-es-index-prefix", "zeno", "(Not implemented) ElasticSearch index prefix to use for indexing crawl logs. Default is : `zeno`, without `-`")
+ getCmd.PersistentFlags().String("log-es-rotation", "1d", "(Not implemented) ElasticSearch index rotation period. Default is : `1d`. Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'.")
+ // log-level is defined in the root command
+ // no-stdout-log is defined in the root command
// Dependencies flags
getCmd.PersistentFlags().Bool("no-ytdlp", false, "Disable youtube-dlp usage for video extraction.")
getCmd.PersistentFlags().String("ytdlp-path", "", "Path to youtube-dlp binary.")
+ // Profiling flags
+ getCmd.PersistentFlags().String("pyroscope-address", "", "Pyroscope server address. Setting this flag will enable profiling.")
+
// Alias support
// As cobra doesn't support aliases natively (couldn't find a way to do it), we have to do it manually
// This is a workaround to allow users to use `--hops` instead of `--max-hops` for example
// Aliases shouldn't be used as proper flags nor declared in the config struct
// Aliases should be marked as deprecated to inform the user base
// Aliases values should be copied to the proper flag in the config/config.go:handleFlagsAliases() function
- getCmd.PersistentFlags().Uint8("hops", 0, "Maximum number of hops to execute.")
+ getCmd.PersistentFlags().Int("hops", 0, "Maximum number of hops to execute.")
getCmd.PersistentFlags().MarkDeprecated("hops", "use --max-hops instead")
getCmd.PersistentFlags().MarkHidden("hops")
diff --git a/cmd/get_hq.go b/cmd/get_hq.go
index 3b3ad02d..2c6fe45f 100644
--- a/cmd/get_hq.go
+++ b/cmd/get_hq.go
@@ -2,46 +2,86 @@ package cmd
import (
"fmt"
+ "os"
+ "runtime"
+ "time"
- "github.com/internetarchive/Zeno/internal/pkg/crawl"
+ "github.com/google/uuid"
+ "github.com/grafana/pyroscope-go"
+ "github.com/internetarchive/Zeno/internal/pkg/config"
+ "github.com/internetarchive/Zeno/internal/pkg/controler"
+ "github.com/internetarchive/Zeno/internal/pkg/ui"
+ "github.com/internetarchive/Zeno/internal/pkg/utils"
"github.com/spf13/cobra"
)
var getHQCmd = &cobra.Command{
Use: "hq",
Short: "Start crawling with the crawl HQ connector.",
- PreRunE: func(cmd *cobra.Command, args []string) error {
+ PreRunE: func(_ *cobra.Command, _ []string) error {
if cfg == nil {
return fmt.Errorf("viper config is nil")
}
- cfg.HQ = true
- return nil
- },
- RunE: func(cmd *cobra.Command, args []string) error {
- // Init crawl using the flags provided
- crawl, err := crawl.GenerateCrawlConfig(cfg)
+
+ err := config.GenerateCrawlConfig()
if err != nil {
- if crawl != nil && crawl.Log != nil {
- crawl.Log.WithFields(map[string]interface{}{
- "crawl": crawl,
- "err": err.Error(),
- }).Error("'get hq' exited due to error")
- }
return err
}
- // start crawl
- err = crawl.Start()
- if err != nil {
- crawl.Log.WithFields(map[string]interface{}{
- "crawl": crawl,
- "err": err.Error(),
- }).Error("'get hq' Crawl() exited due to error")
- return err
+ cfg.UseHQ = true
+
+ if cfg.PyroscopeAddress != "" {
+ runtime.SetMutexProfileFraction(5)
+ runtime.SetBlockProfileRate(5)
+
+ // Get the hostname via env or via command
+ hostname, err := os.Hostname()
+ if err != nil {
+ return fmt.Errorf("error getting hostname for Pyroscope: %w", err)
+ }
+
+ Version := utils.GetVersion()
+
+ _, err = pyroscope.Start(pyroscope.Config{
+ ApplicationName: fmt.Sprintf("zeno"),
+ ServerAddress: cfg.PyroscopeAddress,
+ Logger: nil,
+ Tags: map[string]string{"hostname": hostname, "job": cfg.Job, "version": Version.Version, "goVersion": Version.GoVersion, "uuid": uuid.New().String()[:5]},
+ UploadRate: 15 * time.Second,
+ ProfileTypes: []pyroscope.ProfileType{
+ pyroscope.ProfileCPU,
+ pyroscope.ProfileAllocObjects,
+ pyroscope.ProfileAllocSpace,
+ pyroscope.ProfileInuseObjects,
+ pyroscope.ProfileInuseSpace,
+ pyroscope.ProfileGoroutines,
+ pyroscope.ProfileMutexCount,
+ pyroscope.ProfileMutexDuration,
+ pyroscope.ProfileBlockCount,
+ pyroscope.ProfileBlockDuration,
+ },
+ })
+
+ if err != nil {
+ panic(fmt.Errorf("error starting pyroscope: %w", err))
+ }
}
return nil
},
+ RunE: func(_ *cobra.Command, _ []string) error {
+ controler.Start()
+ if config.Get().TUI {
+ tui := ui.New()
+ err := tui.Start()
+ if err != nil {
+ return fmt.Errorf("error starting TUI: %w", err)
+ }
+ } else {
+ controler.WatchSignals()
+ }
+ return nil
+ },
}
func getHQCmdFlags(getHQCmd *cobra.Command) {
@@ -52,8 +92,8 @@ func getHQCmdFlags(getHQCmd *cobra.Command) {
getHQCmd.PersistentFlags().String("hq-project", "", "Crawl HQ project.")
getHQCmd.PersistentFlags().Bool("hq-continuous-pull", false, "If turned on, the crawler will pull URLs from Crawl HQ continuously.")
getHQCmd.PersistentFlags().String("hq-strategy", "lifo", "Crawl HQ feeding strategy.")
- getHQCmd.PersistentFlags().Int64("hq-batch-size", 0, "Crawl HQ feeding batch size.")
- getHQCmd.PersistentFlags().Int64("hq-batch-concurrency", 1, "Number of concurrent requests to do to get the --hq-batch-size, if batch size is 300 and batch-concurrency is 10, 30 requests will be done concurrently.")
+ getHQCmd.PersistentFlags().Int("hq-batch-size", 500, "Crawl HQ feeding batch size.")
+ getHQCmd.PersistentFlags().Int("hq-batch-concurrency", 1, "Number of concurrent requests to do to get the --hq-batch-size, if batch size is 300 and batch-concurrency is 10, 30 requests will be done concurrently.")
getHQCmd.PersistentFlags().Bool("hq-rate-limiting-send-back", false, "If turned on, the crawler will send back URLs that hit a rate limit to crawl HQ.")
getHQCmd.MarkPersistentFlagRequired("hq-address")
diff --git a/cmd/get_list.go b/cmd/get_list.go
deleted file mode 100644
index 1f935b7e..00000000
--- a/cmd/get_list.go
+++ /dev/null
@@ -1,61 +0,0 @@
-package cmd
-
-import (
- "fmt"
-
- "github.com/internetarchive/Zeno/internal/pkg/crawl"
- "github.com/internetarchive/Zeno/internal/pkg/queue"
- "github.com/spf13/cobra"
-)
-
-var getListCmd = &cobra.Command{
- Use: "list [FILE]",
- Short: "Start crawling with a seed list",
- Args: cobra.ExactArgs(1),
- PreRunE: func(cmd *cobra.Command, args []string) error {
- if cfg == nil {
- return fmt.Errorf("viper config is nil")
- }
- return nil
- },
- RunE: func(cmd *cobra.Command, args []string) error {
- // Init crawl using the flags provided
- crawl, err := crawl.GenerateCrawlConfig(cfg)
- if err != nil {
- if crawl != nil && crawl.Log != nil {
- crawl.Log.WithFields(map[string]interface{}{
- "crawl": crawl,
- "err": err.Error(),
- }).Error("'get hq' exited due to error")
- }
- return err
- }
-
- // Initialize initial seed list
- crawl.SeedList, err = queue.FileToItems(args[0])
- if err != nil || len(crawl.SeedList) <= 0 {
- crawl.Log.WithFields(map[string]interface{}{
- "input": args[0],
- "err": err.Error(),
- }).Error("This is not a valid input")
- return err
- }
-
- crawl.Log.WithFields(map[string]interface{}{
- "input": args[0],
- "seedsCount": len(crawl.SeedList),
- }).Info("Seed list loaded")
-
- // Start crawl
- err = crawl.Start()
- if err != nil {
- crawl.Log.WithFields(map[string]interface{}{
- "crawl": crawl,
- "err": err.Error(),
- }).Error("Crawl exited due to error")
- return err
- }
-
- return nil
- },
-}
diff --git a/cmd/get_url.go b/cmd/get_url.go
index 7212cd68..1ed3202f 100644
--- a/cmd/get_url.go
+++ b/cmd/get_url.go
@@ -2,10 +2,9 @@ package cmd
import (
"fmt"
- "net/url"
- "github.com/internetarchive/Zeno/internal/pkg/crawl"
- "github.com/internetarchive/Zeno/internal/pkg/queue"
+ "github.com/internetarchive/Zeno/internal/pkg/config"
+ "github.com/internetarchive/Zeno/internal/pkg/controler"
"github.com/spf13/cobra"
)
@@ -13,58 +12,29 @@ var getURLCmd = &cobra.Command{
Use: "url [URL...]",
Short: "Archive given URLs",
Args: cobra.MinimumNArgs(1),
- PreRunE: func(cmd *cobra.Command, args []string) error {
+ PreRunE: func(_ *cobra.Command, args []string) error {
if cfg == nil {
return fmt.Errorf("viper config is nil")
}
- return nil
- },
- RunE: func(cmd *cobra.Command, args []string) error {
- // Init crawl using the flags provided
- crawl, err := crawl.GenerateCrawlConfig(cfg)
- if err != nil {
- if crawl != nil && crawl.Log != nil {
- crawl.Log.WithFields(map[string]interface{}{
- "crawl": crawl,
- "err": err.Error(),
- }).Error("'get url' exited due to error")
- }
- return err
- }
- // Initialize initial seed list
- for _, arg := range args {
- input, err := url.Parse(arg)
- if err != nil {
- crawl.Log.WithFields(map[string]interface{}{
- "input_url": arg,
- "err": err.Error(),
- }).Error("given URL is not a valid input")
- return err
- }
+ if len(args) == 0 {
+ return fmt.Errorf("no URLs provided")
+ }
- item, err := queue.NewItem(input, nil, "seed", 0, "", false)
- if err != nil {
- crawl.Log.WithFields(map[string]interface{}{
- "input_url": arg,
- "err": err.Error(),
- }).Error("Failed to create new item")
- return err
- }
- crawl.SeedList = append(crawl.SeedList, *item)
+ return nil
+ },
+ RunE: func(_ *cobra.Command, args []string) error {
+ for _, URL := range args {
+ config.Get().InputSeeds = append(config.Get().InputSeeds, URL)
}
- // Start crawl
- err = crawl.Start()
+ err := config.GenerateCrawlConfig()
if err != nil {
- crawl.Log.WithFields(map[string]interface{}{
- "crawl": crawl,
- "err": err.Error(),
- }).Error("'get url' Crawl() exited due to error")
return err
}
- crawl.Log.Info("Crawl finished")
- return err
+ controler.Start()
+ controler.WatchSignals()
+ return nil
},
}
diff --git a/config/config.go b/config/config.go
deleted file mode 100644
index 916d8fc2..00000000
--- a/config/config.go
+++ /dev/null
@@ -1,204 +0,0 @@
-package config
-
-import (
- "fmt"
- "net/url"
- "os"
- "path/filepath"
- "strings"
- "sync"
-
- "github.com/spf13/pflag"
- "github.com/spf13/viper"
-)
-
-// Config holds all configuration for our program
-type Config struct {
- LogLevel string `mapstructure:"log-level"`
- UserAgent string `mapstructure:"user-agent"`
- Job string `mapstructure:"job"`
- Cookies string `mapstructure:"cookies"`
- APIPort string `mapstructure:"api-port"`
- PrometheusPrefix string `mapstructure:"prometheus-prefix"`
- WARCPrefix string `mapstructure:"warc-prefix"`
- WARCOperator string `mapstructure:"warc-operator"`
- CDXDedupeServer string `mapstructure:"warc-cdx-dedupe-server"`
- WARCTempDir string `mapstructure:"warc-temp-dir"`
- WARCSize int `mapstructure:"warc-size"`
- CDXCookie string `mapstructure:"cdx-cookie"`
- HQAddress string `mapstructure:"hq-address"`
- HQKey string `mapstructure:"hq-key"`
- HQSecret string `mapstructure:"hq-secret"`
- HQProject string `mapstructure:"hq-project"`
- HQStrategy string `mapstructure:"hq-strategy"`
- HQBatchSize int64 `mapstructure:"hq-batch-size"`
- HQBatchConcurrency int `mapstructure:"hq-batch-concurrency"`
- LogFileOutputDir string `mapstructure:"log-file-output-dir"`
- ElasticSearchUsername string `mapstructure:"es-user"`
- ElasticSearchPassword string `mapstructure:"es-password"`
- ElasticSearchIndexPrefix string `mapstructure:"es-index-prefix"`
- DisableHTMLTag []string `mapstructure:"disable-html-tag"`
- ExcludeHosts []string `mapstructure:"exclude-host"`
- IncludeHosts []string `mapstructure:"include-host"`
- IncludeString []string `mapstructure:"include-string"`
- ExcludeString []string `mapstructure:"exclude-string"`
- ElasticSearchURLs []string `mapstructure:"es-url"`
- WorkersCount int `mapstructure:"workers"`
- MaxConcurrentAssets int `mapstructure:"max-concurrent-assets"`
- MaxHops uint8 `mapstructure:"max-hops"`
- MaxRedirect uint8 `mapstructure:"max-redirect"`
- MaxRetry uint8 `mapstructure:"max-retry"`
- HTTPTimeout int `mapstructure:"http-timeout"`
- MaxConcurrentRequestsPerDomain int `mapstructure:"max-concurrent-per-domain"`
- ConcurrentSleepLength int `mapstructure:"concurrent-sleep-length"`
- CrawlTimeLimit int `mapstructure:"crawl-time-limit"`
- CrawlMaxTimeLimit int `mapstructure:"crawl-max-time-limit"`
- MinSpaceRequired int `mapstructure:"min-space-required"`
- WARCPoolSize int `mapstructure:"warc-pool-size"`
- WARCDedupeSize int `mapstructure:"warc-dedupe-size"`
- KeepCookies bool `mapstructure:"keep-cookies"`
- Headless bool `mapstructure:"headless"`
- DisableSeencheck bool `mapstructure:"disable-seencheck"`
- JSON bool `mapstructure:"json"`
- Debug bool `mapstructure:"debug"`
- LiveStats bool `mapstructure:"live-stats"`
- API bool `mapstructure:"api"`
- Prometheus bool `mapstructure:"prometheus"`
- DomainsCrawl bool `mapstructure:"domains-crawl"`
- CaptureAlternatePages bool `mapstructure:"capture-alternate-pages"`
- WARCOnDisk bool `mapstructure:"warc-on-disk"`
- DisableLocalDedupe bool `mapstructure:"disable-local-dedupe"`
- CertValidation bool `mapstructure:"cert-validation"`
- DisableAssetsCapture bool `mapstructure:"disable-assets-capture"`
- HQ bool // Special field to check if HQ is enabled depending on the command called
- HQContinuousPull bool `mapstructure:"hq-continuous-pull"`
- HQRateLimitSendBack bool `mapstructure:"hq-rate-limiting-send-back"`
- NoStdoutLogging bool `mapstructure:"no-stdout-log"`
- NoBatchWriteWAL bool `mapstructure:"ultrasafe-queue"`
- Handover bool `mapstructure:"handover"`
-
- // Network
- Proxy string `mapstructure:"proxy"`
- DomainsBypassProxy []string `mapstructure:"bypass-proxy"`
- RandomLocalIP bool `mapstructure:"random-local-ip"`
- DisableIPv4 bool `mapstructure:"disable-ipv4"`
- DisableIPv6 bool `mapstructure:"disable-ipv6"`
- IPv6AnyIP bool `mapstructure:"ipv6-anyip"`
-
- // Dependencies
- NoYTDLP bool `mapstructure:"no-ytdlp"`
- YTDLPPath string `mapstructure:"ytdlp-path"`
-}
-
-var (
- config *Config
- once sync.Once
-)
-
-// InitConfig initializes the configuration
-// Flags -> Env -> Config file -> Consul config
-// Latest has precedence over the rest
-func InitConfig() error {
- var err error
- once.Do(func() {
- config = &Config{}
-
- // Check if a config file is provided via flag
- if configFile := viper.GetString("config-file"); configFile != "" {
- viper.SetConfigFile(configFile)
- } else {
- home, err := os.UserHomeDir()
- if err != nil {
- fmt.Println(err)
- os.Exit(1)
- }
-
- viper.AddConfigPath(home)
- viper.SetConfigType("yaml")
- viper.SetConfigName("zeno-config")
- }
-
- viper.SetEnvPrefix("ZENO")
- replacer := strings.NewReplacer("-", "_", ".", "_")
- viper.SetEnvKeyReplacer(replacer)
- viper.AutomaticEnv()
-
- if err = viper.ReadInConfig(); err == nil {
- fmt.Println("Using config file:", viper.ConfigFileUsed())
- }
-
- if viper.GetBool("consul-config") && viper.GetString("consul-address") != "" {
- var consulAddress *url.URL
- consulAddress, err = url.Parse(viper.GetString("consul-address"))
- if err != nil {
- return
- }
-
- consulPath, consulFile := filepath.Split(viper.GetString("consul-path"))
- viper.AddRemoteProvider("consul", consulAddress.String(), consulPath)
- viper.SetConfigType(filepath.Ext(consulFile))
- viper.SetConfigName(strings.TrimSuffix(consulFile, filepath.Ext(consulFile)))
-
- if err = viper.ReadInConfig(); err == nil {
- fmt.Println("Using config file:", viper.ConfigFileUsed())
- }
- }
-
- // This function is used to bring logic to the flags when needed (e.g. live-stats)
- handleFlagsEdgeCases()
-
- // This function is used to handle flags aliases (e.g. hops -> max-hops)
- handleFlagsAliases()
-
- // Unmarshal the config into the Config struct
- err = viper.Unmarshal(config)
- })
- return err
-}
-
-// BindFlags binds the flags to the viper configuration
-// This is needed because viper doesn't support same flag name accross multiple commands
-// Details here: https://github.com/spf13/viper/issues/375#issuecomment-794668149
-func BindFlags(flagSet *pflag.FlagSet) {
- flagSet.VisitAll(func(flag *pflag.Flag) {
- viper.BindPFlag(flag.Name, flag)
- })
-}
-
-// GetConfig returns the config struct
-func GetConfig() *Config {
- cfg := config
- if cfg == nil {
- panic("Config not initialized. Call InitConfig() before accessing the config.")
- }
- return cfg
-}
-
-func handleFlagsEdgeCases() {
- if viper.GetBool("live-stats") {
- // If live-stats is true, set no-stdout-log to true
- viper.Set("no-stdout-log", true)
- }
-
- if viper.GetBool("prometheus") {
- // If prometheus is true, set no-stdout-log to true
- viper.Set("api", true)
- }
-}
-
-func handleFlagsAliases() {
- // For each flag we want to alias, we check if the original flag is at default and if the alias is not
- // If so, we set the original flag to the value of the alias
-
- if viper.GetUint("hops") != 0 && viper.GetUint("max-hops") == 0 {
- viper.Set("max-hops", viper.GetUint("hops"))
- }
-
- if viper.GetInt("ca") != 8 && viper.GetInt("max-concurrent-assets") == 8 {
- viper.Set("max-concurrent-assets", viper.GetInt("ca"))
- }
-
- if viper.GetInt("msr") != 20 && viper.GetInt("min-space-required") == 20 {
- viper.Set("min-space-required", viper.GetInt("msr"))
- }
-}
diff --git a/go.mod b/go.mod
index 1200c6fa..8830ce84 100644
--- a/go.mod
+++ b/go.mod
@@ -1,95 +1,80 @@
module github.com/internetarchive/Zeno
-go 1.22.4
+go 1.24
+
+toolchain go1.24.0
require (
- github.com/CorentinB/warc v0.8.53
- github.com/PuerkitoBio/goquery v1.9.3
- github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2
- github.com/clbanning/mxj/v2 v2.7.0
+ github.com/CorentinB/warc v0.8.71
+ github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3
+ github.com/PuerkitoBio/goquery v1.10.1
+ github.com/ada-url/goada v0.0.0-20250104020233-00cbf4dc9da1
+ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc
github.com/dustin/go-humanize v1.0.1
- github.com/elastic/go-elasticsearch/v8 v8.15.0
+ github.com/gabriel-vasile/mimetype v1.4.8
+ github.com/gdamore/tcell/v2 v2.8.1
github.com/google/uuid v1.6.0
- github.com/gosuri/uilive v0.0.4
- github.com/gosuri/uitable v0.0.4
- github.com/grafov/m3u8 v0.12.0
- github.com/internetarchive/gocrawlhq v1.2.20
- github.com/paulbellamy/ratecounter v0.2.0
+ github.com/grafana/pyroscope-go v1.2.0
+ github.com/grafov/m3u8 v0.12.1
+ github.com/internetarchive/gocrawlhq v1.2.28
github.com/philippgille/gokv/leveldb v0.7.0
- github.com/prometheus/client_golang v1.20.4
- github.com/remeh/sizedwaitgroup v1.0.0
- github.com/sirupsen/logrus v1.9.3
- github.com/spf13/cobra v1.8.1
- github.com/spf13/pflag v1.0.5
+ github.com/rivo/tview v0.0.0-20241227133733-17b7edb88c57
+ github.com/samber/slog-multi v1.4.0
+ github.com/spf13/cobra v1.9.1
+ github.com/spf13/pflag v1.0.6
github.com/spf13/viper v1.19.0
- github.com/telanflow/cookiejar v0.0.0-20190719062046-114449e86aa5
go.uber.org/goleak v1.3.0
- golang.org/x/net v0.29.0
- google.golang.org/protobuf v1.34.2
- mvdan.cc/xurls/v2 v2.5.0
+ golang.org/x/net v0.35.0
+ mvdan.cc/xurls/v2 v2.6.0
)
require (
- github.com/andybalholm/brotli v1.1.0 // indirect
- github.com/andybalholm/cascadia v1.3.2 // indirect
- github.com/aws/aws-sdk-go v1.55.5 // indirect
- github.com/beorn7/perks v1.0.1 // indirect
- github.com/cespare/xxhash/v2 v2.3.0 // indirect
- github.com/cloudflare/circl v1.4.0 // indirect
- github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
- github.com/elastic/elastic-transport-go/v8 v8.6.0 // indirect
- github.com/fatih/color v1.17.0 // indirect
- github.com/fsnotify/fsnotify v1.7.0 // indirect
- github.com/go-logr/logr v1.4.2 // indirect
- github.com/go-logr/stdr v1.2.2 // indirect
+ github.com/andybalholm/brotli v1.1.1 // indirect
+ github.com/andybalholm/cascadia v1.3.3 // indirect
+ github.com/cloudflare/circl v1.6.0 // indirect
+ github.com/dolthub/maphash v0.1.0 // indirect
+ github.com/fsnotify/fsnotify v1.8.0 // indirect
+ github.com/gammazero/deque v1.0.0 // indirect
+ github.com/gdamore/encoding v1.0.1 // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.4.0 // indirect
github.com/golang/snappy v0.0.4 // indirect
- github.com/gomodule/redigo v1.9.2 // indirect
- github.com/google/go-cmp v0.6.0 // indirect
+ github.com/grafana/pyroscope-go/godeltaprof v0.1.8 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
- github.com/jmespath/go-jmespath v0.4.0 // indirect
- github.com/json-iterator/go v1.1.12 // indirect
- github.com/klauspost/compress v1.17.10 // indirect
- github.com/magiconair/properties v1.8.7 // indirect
- github.com/mattn/go-colorable v0.1.13 // indirect
- github.com/mattn/go-isatty v0.0.20 // indirect
+ github.com/klauspost/compress v1.18.0 // indirect
+ github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
+ github.com/magiconair/properties v1.8.9 // indirect
github.com/mattn/go-runewidth v0.0.16 // indirect
- github.com/miekg/dns v1.1.62 // indirect
+ github.com/maypok86/otter v1.2.4 // indirect
+ github.com/miekg/dns v1.1.63 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
- github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
- github.com/modern-go/reflect2 v1.0.2 // indirect
- github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/onsi/gomega v1.34.2 // indirect
+ github.com/paulbellamy/ratecounter v0.2.0 // indirect
github.com/pelletier/go-toml/v2 v2.2.3 // indirect
github.com/philippgille/gokv/encoding v0.7.0 // indirect
github.com/philippgille/gokv/util v0.7.0 // indirect
- github.com/prometheus/client_model v0.6.1 // indirect
- github.com/prometheus/common v0.59.1 // indirect
- github.com/prometheus/procfs v0.15.1 // indirect
github.com/refraction-networking/utls v1.6.7 // indirect
github.com/rivo/uniseg v0.4.7 // indirect
- github.com/sagikazarmark/locafero v0.6.0 // indirect
+ github.com/sagikazarmark/locafero v0.7.0 // indirect
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
+ github.com/samber/lo v1.49.1 // indirect
github.com/sourcegraph/conc v0.3.0 // indirect
- github.com/spf13/afero v1.11.0 // indirect
- github.com/spf13/cast v1.7.0 // indirect
+ github.com/spf13/afero v1.12.0 // indirect
+ github.com/spf13/cast v1.7.1 // indirect
github.com/subosito/gotenv v1.6.0 // indirect
github.com/syndtr/goleveldb v1.0.0 // indirect
github.com/ulikunitz/xz v0.5.12 // indirect
- go.opentelemetry.io/otel v1.30.0 // indirect
- go.opentelemetry.io/otel/metric v1.30.0 // indirect
- go.opentelemetry.io/otel/trace v1.30.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
- golang.org/x/crypto v0.27.0 // indirect
- golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 // indirect
- golang.org/x/mod v0.21.0 // indirect
- golang.org/x/sync v0.8.0 // indirect
- golang.org/x/sys v0.26.0 // indirect
- golang.org/x/text v0.18.0 // indirect
- golang.org/x/tools v0.25.0 // indirect
+ golang.org/x/crypto v0.33.0 // indirect
+ golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 // indirect
+ golang.org/x/mod v0.23.0 // indirect
+ golang.org/x/sync v0.11.0 // indirect
+ golang.org/x/sys v0.30.0 // indirect
+ golang.org/x/term v0.29.0 // indirect
+ golang.org/x/text v0.22.0 // indirect
+ golang.org/x/tools v0.30.0 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
diff --git a/go.sum b/go.sum
index a4f66821..b4a97ae3 100644
--- a/go.sum
+++ b/go.sum
@@ -1,54 +1,42 @@
-git.archive.org/wb/gocrawlhq v1.2.13 h1:PqEhgtYqNEUWO2JEJUHmXT+nIwW9LRgb4ocUFANciQo=
-git.archive.org/wb/gocrawlhq v1.2.13/go.mod h1:JQIKgebFmpbxmEalNRjID3RwCxHkslt3PHAnum82KtM=
-github.com/CorentinB/warc v0.8.52 h1:k6lkq3uh6PkhZG+WKpPEkeQPmO1byb7MnSZaNT28SH4=
-github.com/CorentinB/warc v0.8.52/go.mod h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8=
-github.com/CorentinB/warc v0.8.53 h1:xVz3RMdZ6faAqTtLfcK1/yl8ZTansy+B2en//EZLUlM=
-github.com/CorentinB/warc v0.8.53/go.mod h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8=
-github.com/PuerkitoBio/goquery v1.9.3 h1:mpJr/ikUA9/GNJB/DBZcGeFDXUtosHRyRrwh7KGdTG0=
-github.com/PuerkitoBio/goquery v1.9.3/go.mod h1:1ndLHPdTz+DyQPICCWYlYQMPl0oXZj0G6D4LCYA6u4U=
-github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4=
-github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4=
-github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M=
-github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY=
-github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
-github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
+github.com/CorentinB/warc v0.8.71 h1:b8RKjkoH0lkh9eU1RQirlh1Xh6dh9hs33WkDz6SLenM=
+github.com/CorentinB/warc v0.8.71/go.mod h1:1fAGpKVIWnIuC79VdcrX0TyF00CHFzZ2VOYF7g5ivzA=
+github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3 h1:ClzzXMDDuUbWfNNZqGeYq4PnYOlwlOVIvSyNaIy0ykg=
+github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3/go.mod h1:we0YA5CsBbH5+/NUzC/AlMmxaDtWlXeNsqrwXjTzmzA=
+github.com/PuerkitoBio/goquery v1.10.1 h1:Y8JGYUkXWTGRB6Ars3+j3kN0xg1YqqlwvdTV8WTFQcU=
+github.com/PuerkitoBio/goquery v1.10.1/go.mod h1:IYiHrOMps66ag56LEH7QYDDupKXyo5A8qrjIx3ZtujY=
+github.com/ada-url/goada v0.0.0-20250104020233-00cbf4dc9da1 h1:K54lYH7ZY/NHweMd9/R82dHaFelQQmwjEhUfwUqCqEk=
+github.com/ada-url/goada v0.0.0-20250104020233-00cbf4dc9da1/go.mod h1:+D/veNwI2mA1hDYLVrYSobYcLFWm6e3DJ/H/d/dxlu8=
+github.com/andybalholm/brotli v1.1.1 h1:PR2pgnyFznKEugtsUo0xLdDop5SKXd5Qf5ysW+7XdTA=
+github.com/andybalholm/brotli v1.1.1/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA=
+github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
+github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
-github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so=
-github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw=
-github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU=
-github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU=
-github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
-github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
-github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
-github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
-github.com/clbanning/mxj/v2 v2.7.0 h1:WA/La7UGCanFe5NpHF0Q3DNtnCsVoxbPKuyBNHWRyME=
-github.com/clbanning/mxj/v2 v2.7.0/go.mod h1:hNiWqW14h+kc+MdF9C6/YoRfjEJoR3ou6tn/Qo+ve2s=
-github.com/cloudflare/circl v1.4.0 h1:BV7h5MgrktNzytKmWjpOtdYrf0lkkbF8YMlBGPhJQrY=
-github.com/cloudflare/circl v1.4.0/go.mod h1:PDRU+oXvdD7KCtgKxW95M5Z8BpSCJXQORiZFnBQS5QU=
-github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
+github.com/cloudflare/circl v1.6.0 h1:cr5JKic4HI+LkINy2lg3W2jF8sHCVTBncJr5gIIq7qk=
+github.com/cloudflare/circl v1.6.0/go.mod h1:uddAzsPgqdMAYatqJ0lsjX1oECcQLIlRpzZh3pJrofs=
+github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/dolthub/maphash v0.1.0 h1:bsQ7JsF4FkkWyrP3oCnFJgrCUAFbFf3kOl4L/QxPDyQ=
+github.com/dolthub/maphash v0.1.0/go.mod h1:gkg4Ch4CdCDu5h6PMriVLawB7koZ+5ijb9puGMV50a4=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
-github.com/elastic/elastic-transport-go/v8 v8.6.0 h1:Y2S/FBjx1LlCv5m6pWAF2kDJAHoSjSRSJCApolgfthA=
-github.com/elastic/elastic-transport-go/v8 v8.6.0/go.mod h1:YLHer5cj0csTzNFXoNQ8qhtGY1GTvSqPnKWKaqQE3Hk=
-github.com/elastic/go-elasticsearch/v8 v8.15.0 h1:IZyJhe7t7WI3NEFdcHnf6IJXqpRf+8S8QWLtZYYyBYk=
-github.com/elastic/go-elasticsearch/v8 v8.15.0/go.mod h1:HCON3zj4btpqs2N1jjsAy4a/fiAul+YBP00mBH4xik8=
-github.com/fatih/color v1.17.0 h1:GlRw1BRJxkpqUCBKzKOw098ed57fEsKeNjpTe3cSjK4=
-github.com/fatih/color v1.17.0/go.mod h1:YZ7TlrGPkiz6ku9fK3TLD/pl3CpsiFyu8N92HLgmosI=
+github.com/dvyukov/go-fuzz v0.0.0-20200318091601-be3528f3a813/go.mod h1:11Gm+ccJnvAhCNLlf5+cS9KjtbaD5I5zaZpFMsTHWTw=
github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0=
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
-github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
-github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
-github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
-github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
-github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
-github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
-github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
+github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M=
+github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
+github.com/gabriel-vasile/mimetype v1.4.8 h1:FfZ3gj38NjllZIeJAmMhr+qKL8Wu+nOoI3GqacKw1NM=
+github.com/gabriel-vasile/mimetype v1.4.8/go.mod h1:ByKUIKGjh1ODkGM1asKUbQZOLGrPjydw3hYPU2YU9t8=
+github.com/gammazero/deque v1.0.0 h1:LTmimT8H7bXkkCy6gZX7zNLtkbz4NdS2z8LZuor3j34=
+github.com/gammazero/deque v1.0.0/go.mod h1:iflpYvtGfM3U8S8j+sZEKIak3SAKYpA5/SQewgfXDKo=
+github.com/gdamore/encoding v1.0.1 h1:YzKZckdBL6jVt2Gc+5p82qhrGiqMdG/eNs6Wy0u3Uhw=
+github.com/gdamore/encoding v1.0.1/go.mod h1:0Z0cMFinngz9kS1QfMjCP8TY7em3bZYeeklsSDPivEo=
+github.com/gdamore/tcell/v2 v2.8.1 h1:KPNxyqclpWpWQlPLx6Xui1pMk8S+7+R37h3g07997NU=
+github.com/gdamore/tcell/v2 v2.8.1/go.mod h1:bj8ori1BG3OYMjmb3IklZVWfZUJ1UBQt9JXrOCOhGWw=
github.com/go-test/deep v1.1.0 h1:WOcxcdHcvdgThNXjw0t76K42FXTU7HpNQWHpA2HHNlg=
github.com/go-test/deep v1.1.0/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE=
github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
@@ -61,74 +49,44 @@ github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5y
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM=
github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
-github.com/gomodule/redigo v1.9.2 h1:HrutZBLhSIU8abiSfW8pj8mPhOyMYjZT/wcA4/L9L9s=
-github.com/gomodule/redigo v1.9.2/go.mod h1:KsU3hiK/Ay8U42qpaJk+kuNa3C+spxapWpM+ywhcgtw=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
-github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
-github.com/gosuri/uilive v0.0.4 h1:hUEBpQDj8D8jXgtCdBu7sWsy5sbW/5GhuO8KBwJ2jyY=
-github.com/gosuri/uilive v0.0.4/go.mod h1:V/epo5LjjlDE5RJUcqx8dbw+zc93y5Ya3yg8tfZ74VI=
-github.com/gosuri/uitable v0.0.4 h1:IG2xLKRvErL3uhY6e1BylFzG+aJiwQviDDTfOKeKTpY=
-github.com/gosuri/uitable v0.0.4/go.mod h1:tKR86bXuXPZazfOTG1FIzvjIdXzd0mo4Vtn16vt0PJo=
-github.com/grafov/m3u8 v0.12.0 h1:T6iTwTsSEtMcwkayef+FJO8kj+Sglr4Lh81Zj8Ked/4=
-github.com/grafov/m3u8 v0.12.0/go.mod h1:nqzOkfBiZJENr52zTVd/Dcl03yzphIMbJqkXGu+u080=
+github.com/grafana/pyroscope-go v1.2.0 h1:aILLKjTj8CS8f/24OPMGPewQSYlhmdQMBmol1d3KGj8=
+github.com/grafana/pyroscope-go v1.2.0/go.mod h1:2GHr28Nr05bg2pElS+dDsc98f3JTUh2f6Fz1hWXrqwk=
+github.com/grafana/pyroscope-go/godeltaprof v0.1.8 h1:iwOtYXeeVSAeYefJNaxDytgjKtUuKQbJqgAIjlnicKg=
+github.com/grafana/pyroscope-go/godeltaprof v0.1.8/go.mod h1:2+l7K7twW49Ct4wFluZD3tZ6e0SjanjcUUBPVD/UuGU=
+github.com/grafov/m3u8 v0.12.1 h1:DuP1uA1kvRRmGNAZ0m+ObLv1dvrfNO0TPx0c/enNk0s=
+github.com/grafov/m3u8 v0.12.1/go.mod h1:nqzOkfBiZJENr52zTVd/Dcl03yzphIMbJqkXGu+u080=
github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
-github.com/internetarchive/gocrawlhq v1.2.13 h1:ALfUrWR7nRez5gWhHRJ7ZklIpGMjERGMUJqR4HBl4+8=
-github.com/internetarchive/gocrawlhq v1.2.13/go.mod h1:JQIKgebFmpbxmEalNRjID3RwCxHkslt3PHAnum82KtM=
-github.com/internetarchive/gocrawlhq v1.2.14 h1:g3MPMonpA6mTkCpjBvW3paeBHiH+gGgwSvkyX/lxu7s=
-github.com/internetarchive/gocrawlhq v1.2.14/go.mod h1:IOHVfWsptADzh+r2J+UnSm22EB9r8TiVVeAuP9WRFoc=
-github.com/internetarchive/gocrawlhq v1.2.15 h1:Llv6tvxxRUxoC9G4GsjkpbfKX0anbQUU+pwFiROlxzg=
-github.com/internetarchive/gocrawlhq v1.2.15/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs=
-github.com/internetarchive/gocrawlhq v1.2.16 h1:D9JJdLL8uqpHUDU3SxxcXUjQETbxnk08e9xo929xrlE=
-github.com/internetarchive/gocrawlhq v1.2.16/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs=
-github.com/internetarchive/gocrawlhq v1.2.17 h1:nSjFHpDp5C9Q8SrDPibC4Iiih6kpw18+2GnifJiVpO0=
-github.com/internetarchive/gocrawlhq v1.2.17/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs=
-github.com/internetarchive/gocrawlhq v1.2.18 h1:PPe7UqJ2NNOljn70SmUhoKdgPreeqRUk9XVrYShCn4w=
-github.com/internetarchive/gocrawlhq v1.2.18/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs=
-github.com/internetarchive/gocrawlhq v1.2.19 h1:bvDliaeWjt97x64bOf+rKXStQX7VE+ZON/I1FS3sQ6A=
-github.com/internetarchive/gocrawlhq v1.2.19/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek=
-github.com/internetarchive/gocrawlhq v1.2.20 h1:0mIIt9lhPacKr6L2JeISoopQ8EgzC3dISJ3ITGGbOp4=
-github.com/internetarchive/gocrawlhq v1.2.20/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek=
-github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
-github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
-github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
-github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
-github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
-github.com/klauspost/compress v1.17.10 h1:oXAz+Vh0PMUvJczoi+flxpnBEPxoER1IaAnU/NMPtT0=
-github.com/klauspost/compress v1.17.10/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0=
+github.com/internetarchive/gocrawlhq v1.2.28 h1:R21OPHyAe2GhdR7D84PRju/cqQxT2fmlKLZlN1jY3xc=
+github.com/internetarchive/gocrawlhq v1.2.28/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek=
+github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
+github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
-github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
-github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
-github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY=
-github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0=
-github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
-github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
-github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
-github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
-github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
+github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
+github.com/magiconair/properties v1.8.9 h1:nWcCbLq1N2v/cpNsy5WvQ37Fb+YElfq20WJ/a8RkpQM=
+github.com/magiconair/properties v1.8.9/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0=
github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
-github.com/miekg/dns v1.1.62 h1:cN8OuEF1/x5Rq6Np+h1epln8OiyPWV+lROx9LxcGgIQ=
-github.com/miekg/dns v1.1.62/go.mod h1:mvDlcItzm+br7MToIKqkglaGhlFMHJ9DTNNWONWXbNQ=
+github.com/maypok86/otter v1.2.4 h1:HhW1Pq6VdJkmWwcZZq19BlEQkHtI8xgsQzBVXJU0nfc=
+github.com/maypok86/otter v1.2.4/go.mod h1:mKLfoI7v1HOmQMwFgX4QkRk23mX6ge3RDvjdHOWG4R4=
+github.com/miekg/dns v1.1.63 h1:8M5aAw6OMZfFXTT7K5V0Eu5YiiL8l7nUAkyN6C9YwaY=
+github.com/miekg/dns v1.1.63/go.mod h1:6NGHfjhpmr5lt3XPLuyfDJi5AXbNIPM9PY6H6sF1Nfs=
github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
-github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
-github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
-github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
-github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
-github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
-github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
-github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
+github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs=
+github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/ginkgo v1.7.0 h1:WSHQ+IS43OoUrWtD1/bbclrwK8TTH5hzp+umCiuxHgs=
github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
@@ -152,130 +110,146 @@ github.com/philippgille/gokv/util v0.7.0/go.mod h1:i9KLHbPxGiHLMhkix/CcDQhpPbCkJ
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/prometheus/client_golang v1.20.4 h1:Tgh3Yr67PaOv/uTqloMsCEdeuFTatm5zIq5+qNN23vI=
-github.com/prometheus/client_golang v1.20.4/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
-github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
-github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
-github.com/prometheus/common v0.59.1 h1:LXb1quJHWm1P6wq/U824uxYi4Sg0oGvNeUm1z5dJoX0=
-github.com/prometheus/common v0.59.1/go.mod h1:GpWM7dewqmVYcd7SmRaiWVe9SSqjf0UrwnYnpEZNuT0=
-github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
-github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/refraction-networking/utls v1.6.7 h1:zVJ7sP1dJx/WtVuITug3qYUq034cDq9B2MR1K67ULZM=
github.com/refraction-networking/utls v1.6.7/go.mod h1:BC3O4vQzye5hqpmDTWUqi4P5DDhzJfkV1tdqtawQIH0=
-github.com/remeh/sizedwaitgroup v1.0.0 h1:VNGGFwNo/R5+MJBf6yrsr110p0m4/OX4S3DCy7Kyl5E=
-github.com/remeh/sizedwaitgroup v1.0.0/go.mod h1:3j2R4OIe/SeS6YDhICBy22RWjJC5eNCJ1V+9+NVNYlo=
+github.com/rivo/tview v0.0.0-20241227133733-17b7edb88c57 h1:LmsF7Fk5jyEDhJk0fYIqdWNuTxSyid2W42A0L2YWjGE=
+github.com/rivo/tview v0.0.0-20241227133733-17b7edb88c57/go.mod h1:02iFIz7K/A9jGCvrizLPvoqr4cEIx7q54RH5Qudkrss=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
+github.com/rivo/uniseg v0.4.3/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
-github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
-github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
+github.com/rogpeppe/go-internal v1.13.2-0.20241226121412-a5dc8ff20d0a h1:w3tdWGKbLGBPtR/8/oO74W6hmz0qE5q0z9aqSAewaaM=
+github.com/rogpeppe/go-internal v1.13.2-0.20241226121412-a5dc8ff20d0a/go.mod h1:S8kfXMp+yh77OxPD4fdM6YUknrZpQxLhvxzS4gDHENY=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
-github.com/sagikazarmark/locafero v0.6.0 h1:ON7AQg37yzcRPU69mt7gwhFEBwxI6P9T4Qu3N51bwOk=
-github.com/sagikazarmark/locafero v0.6.0/go.mod h1:77OmuIc6VTraTXKXIs/uvUxKGUXjE1GbemJYHqdNjX0=
+github.com/sagikazarmark/locafero v0.7.0 h1:5MqpDsTGNDhY8sGp0Aowyf0qKsPrhewaLSsFaodPcyo=
+github.com/sagikazarmark/locafero v0.7.0/go.mod h1:2za3Cg5rMaTMoG/2Ulr9AwtFaIppKXTRYnozin4aB5k=
github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE=
github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ=
-github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
-github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
+github.com/samber/lo v1.49.1 h1:4BIFyVfuQSEpluc7Fua+j1NolZHiEHEpaSEKdsH0tew=
+github.com/samber/lo v1.49.1/go.mod h1:dO6KHFzUKXgP8LDhU0oI8d2hekjXnGOu0DB8Jecxd6o=
+github.com/samber/slog-multi v1.4.0 h1:pwlPMIE7PrbTHQyKWDU+RIoxP1+HKTNOujk3/kdkbdg=
+github.com/samber/slog-multi v1.4.0/go.mod h1:FsQ4Uv2L+E/8TZt+/BVgYZ1LoDWCbfCU21wVIoMMrO8=
github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo=
github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0=
-github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8=
-github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY=
-github.com/spf13/cast v1.7.0 h1:ntdiHjuueXFgm5nzDRdOS4yfT43P5Fnud6DH50rz/7w=
-github.com/spf13/cast v1.7.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
-github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
-github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
-github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
-github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
+github.com/spf13/afero v1.12.0 h1:UcOPyRBYczmFn6yvphxkn9ZEOY65cpwGKb5mL36mrqs=
+github.com/spf13/afero v1.12.0/go.mod h1:ZTlWwG4/ahT8W7T0WQ5uYmjI9duaLQGy3Q2OAl4sk/4=
+github.com/spf13/cast v1.7.1 h1:cuNEagBQEHWN1FnbGEjCXL2szYEXqfJPbP2HNUaca9Y=
+github.com/spf13/cast v1.7.1/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
+github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo=
+github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0=
+github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o=
+github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/spf13/viper v1.19.0 h1:RWq5SEjt8o25SROyN3z2OrDB9l7RPd3lwTWU8EcEdcI=
github.com/spf13/viper v1.19.0/go.mod h1:GQUN9bilAbhU/jgc1bKs99f/suXKeUMct8Adx5+Ntkg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
-github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
-github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
-github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
+github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
+github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
+github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8=
github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU=
github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE=
github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ=
-github.com/telanflow/cookiejar v0.0.0-20190719062046-114449e86aa5 h1:gTQl5nPlc9B53vFOKM8aJHwxB2BW2kM49PVR5526GBg=
-github.com/telanflow/cookiejar v0.0.0-20190719062046-114449e86aa5/go.mod h1:qNgA5MKwTh103SxGTooqZMiKxZTaV9UV3KjN7I7Drig=
github.com/ulikunitz/xz v0.5.12 h1:37Nm15o69RwBkXM0J6A5OlE67RZTfzUxTj8fB3dfcsc=
github.com/ulikunitz/xz v0.5.12/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
+github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU=
+github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
-go.opentelemetry.io/otel v1.30.0 h1:F2t8sK4qf1fAmY9ua4ohFS/K+FUuOPemHUIXHtktrts=
-go.opentelemetry.io/otel v1.30.0/go.mod h1:tFw4Br9b7fOS+uEao81PJjVMjW/5fvNCbpsDIXqP0pc=
-go.opentelemetry.io/otel/metric v1.30.0 h1:4xNulvn9gjzo4hjg+wzIKG7iNFEaBMX00Qd4QIZs7+w=
-go.opentelemetry.io/otel/metric v1.30.0/go.mod h1:aXTfST94tswhWEb+5QjlSqG+cZlmyXy/u8jFpor3WqQ=
-go.opentelemetry.io/otel/sdk v1.21.0 h1:FTt8qirL1EysG6sTQRZ5TokkU8d0ugCj8htOgThZXQ8=
-go.opentelemetry.io/otel/sdk v1.21.0/go.mod h1:Nna6Yv7PWTdgJHVRD9hIYywQBRx7pbox6nwBnZIxl/E=
-go.opentelemetry.io/otel/trace v1.30.0 h1:7UBkkYzeg3C7kQX8VAidWh2biiQbtAKjyIML8dQ9wmc=
-go.opentelemetry.io/otel/trace v1.30.0/go.mod h1:5EyKqTzzmyqB9bwtCCq6pDLktPK6fmGf/Dph+8VI02o=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
-golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A=
-golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70=
-golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk=
-golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY=
+golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
+golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
+golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
+golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
+golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus=
+golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M=
+golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 h1:yqrTHse8TCMW1M1ZCP+VAR/l0kKxwaAIqN/il7x4voA=
+golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8/go.mod h1:tujkw807nyEEAamNbDrEGzRav+ilXA7PCRAd6xsmwiU=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
-golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0=
-golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY=
+golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/mod v0.23.0 h1:Zb7khfcRGKk+kqfxFaP5tZqCnDZMjC5VtUBs87Hr6QM=
+golang.org/x/mod v0.23.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY=
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
-golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
-golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo=
-golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0=
+golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
+golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
+golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
+golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
+golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
+golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8=
+golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
-golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
+golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w=
+golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
-golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
-golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo=
-golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
+golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
-golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
+golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
+golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
+golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
+golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
+golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
+golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek=
+golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU=
+golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
-golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224=
-golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
+golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
+golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
+golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
-golang.org/x/tools v0.25.0 h1:oFU9pkj/iJgs+0DT+VMHrx+oBKs/LJMV+Uvg78sl+fE=
-golang.org/x/tools v0.25.0/go.mod h1:/vtpO8WL1N9cQC3FN5zPqb//fRXskFHbLKk4OW1Q7rg=
+golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
+golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
+golang.org/x/tools v0.30.0 h1:BgcpHewrV5AUp2G9MebG4XPFI1E2W41zU1SaqVA9vJY=
+golang.org/x/tools v0.30.0/go.mod h1:c347cR/OJfw5TI+GfX7RUPNMdDRRbjvYTS0jPyvsVtY=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
-google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
-gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
+gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU=
+gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4=
gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA=
@@ -283,9 +257,8 @@ gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
-gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
-mvdan.cc/xurls/v2 v2.5.0 h1:lyBNOm8Wo71UknhUs4QTFUNNMyxy2JEIaKKo0RWOh+8=
-mvdan.cc/xurls/v2 v2.5.0/go.mod h1:yQgaGQ1rFtJUzkmKiHYSSfuQxqfYmd//X6PxvholpeE=
+mvdan.cc/xurls/v2 v2.6.0 h1:3NTZpeTxYVWNSokW3MKeyVkz/j7uYXYiMtXRUfmjbgI=
+mvdan.cc/xurls/v2 v2.6.0/go.mod h1:bCvEZ1XvdA6wDnxY7jPPjEmigDtvtvPXAD/Exa9IMSk=
diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go
new file mode 100644
index 00000000..2c513dff
--- /dev/null
+++ b/internal/pkg/archiver/archiver.go
@@ -0,0 +1,249 @@
+package archiver
+
+import (
+ "context"
+ "fmt"
+ "net/http"
+ "strconv"
+ "sync"
+ "time"
+
+ "github.com/CorentinB/warc"
+ "github.com/dustin/go-humanize"
+ "github.com/gabriel-vasile/mimetype"
+ "github.com/internetarchive/Zeno/internal/pkg/config"
+ "github.com/internetarchive/Zeno/internal/pkg/controler/pause"
+ "github.com/internetarchive/Zeno/internal/pkg/log"
+ "github.com/internetarchive/Zeno/internal/pkg/postprocessor/domainscrawl"
+ "github.com/internetarchive/Zeno/internal/pkg/stats"
+ "github.com/internetarchive/Zeno/pkg/models"
+)
+
+func init() {
+ // We intentionally set the limit to 0 to disable the limit on the number of bytes the
+ // mimetype detection can accept. We limit the number of bytes that we will give to it
+ // in the processBody function instead.
+ mimetype.SetLimit(0)
+}
+
+type archiver struct {
+ wg sync.WaitGroup
+ ctx context.Context
+ cancel context.CancelFunc
+ inputCh chan *models.Item
+ outputCh chan *models.Item
+
+ Client *warc.CustomHTTPClient
+ ClientWithProxy *warc.CustomHTTPClient
+}
+
+var (
+ globalArchiver *archiver
+ once sync.Once
+ logger *log.FieldedLogger
+)
+
+// Start initializes the internal archiver structure, start the WARC writer and start routines, should only be called once and returns an error if called more than once
+func Start(inputChan, outputChan chan *models.Item) error {
+ var done bool
+
+ log.Start()
+ logger = log.NewFieldedLogger(&log.Fields{
+ "component": "archiver",
+ })
+
+ stats.Init()
+
+ once.Do(func() {
+ ctx, cancel := context.WithCancel(context.Background())
+ globalArchiver = &archiver{
+ ctx: ctx,
+ cancel: cancel,
+ inputCh: inputChan,
+ outputCh: outputChan,
+ }
+ logger.Debug("initialized")
+
+ // Setup WARC writing HTTP clients
+ startWARCWriter()
+ go watchWARCWritingQueue(250 * time.Millisecond)
+
+ logger.Debug("WARC writer started")
+
+ for i := 0; i < config.Get().WorkersCount; i++ {
+ globalArchiver.wg.Add(1)
+ go globalArchiver.worker(strconv.Itoa(i))
+ }
+
+ logger.Info("started")
+ done = true
+ })
+
+ if !done {
+ return ErrArchiverAlreadyInitialized
+ }
+
+ return nil
+}
+
+// Stop stops the archiver routines and the WARC writer
+func Stop() {
+ if globalArchiver != nil {
+ globalArchiver.cancel()
+ globalArchiver.wg.Wait()
+
+ // Wait for the WARC writing to finish
+ stopLocalWatcher := make(chan struct{})
+ go func() {
+ for {
+ select {
+ case <-stopLocalWatcher:
+ return
+ case <-time.After(1 * time.Second):
+ logger.Debug("waiting for WARC writing to finish", "queue_size", GetWARCWritingQueueSize(), "bytes_written", humanize.Bytes(uint64(warc.DataTotal.Value())))
+ }
+ }
+ }()
+ globalArchiver.Client.WaitGroup.Wait()
+ stopLocalWatcher <- struct{}{}
+ logger.Debug("WARC writing finished")
+ globalArchiver.Client.Close()
+ if globalArchiver.ClientWithProxy != nil {
+ globalArchiver.ClientWithProxy.WaitGroup.Wait()
+ globalArchiver.ClientWithProxy.Close()
+ }
+
+ watchWARCWritingQueueCancel()
+
+ logger.Info("stopped")
+ }
+}
+
+func (a *archiver) worker(workerID string) {
+ defer a.wg.Done()
+
+ logger := log.NewFieldedLogger(&log.Fields{
+ "component": "archiver.worker",
+ "worker_id": workerID,
+ })
+
+ defer logger.Debug("worker stopped")
+
+ // Subscribe to the pause controler
+ controlChans := pause.Subscribe()
+ defer pause.Unsubscribe(controlChans)
+
+ stats.ArchiverRoutinesIncr()
+ defer stats.ArchiverRoutinesDecr()
+
+ for {
+ select {
+ case <-a.ctx.Done():
+ logger.Debug("shutting down")
+ return
+ case <-controlChans.PauseCh:
+ logger.Debug("received pause event")
+ controlChans.ResumeCh <- struct{}{}
+ logger.Debug("received resume event")
+ case seed, ok := <-a.inputCh:
+ if ok {
+ logger.Debug("received seed", "seed", seed.GetShortID(), "depth", seed.GetDepth(), "hops", seed.GetURL().GetHops())
+
+ if err := seed.CheckConsistency(); err != nil {
+ panic(fmt.Sprintf("seed consistency check failed with err: %s, seed id %s", err.Error(), seed.GetShortID()))
+ }
+
+ if seed.GetStatus() != models.ItemPreProcessed && seed.GetStatus() != models.ItemGotRedirected && seed.GetStatus() != models.ItemGotChildren {
+ logger.Debug("skipping seed", "seed", seed.GetShortID(), "depth", seed.GetDepth(), "hops", seed.GetURL().GetHops(), "status", seed.GetStatus().String())
+ } else {
+ archive(workerID, seed)
+ }
+
+ select {
+ case <-a.ctx.Done():
+ logger.Debug("aborting seed due to stop", "seed", seed.GetShortID(), "depth", seed.GetDepth(), "hops", seed.GetURL().GetHops())
+ return
+ case a.outputCh <- seed:
+ }
+ }
+ }
+ }
+}
+
+func archive(workerID string, seed *models.Item) {
+ // TODO: rate limiting handling
+ logger := log.NewFieldedLogger(&log.Fields{
+ "component": "archiver.archive",
+ "worker_id": workerID,
+ })
+
+ var (
+ guard = make(chan struct{}, config.Get().MaxConcurrentAssets)
+ wg sync.WaitGroup
+ )
+
+ items, err := seed.GetNodesAtLevel(seed.GetMaxDepth())
+ if err != nil {
+ logger.Error("unable to get nodes at level", "err", err.Error(), "seed_id", seed.GetShortID())
+ panic(err)
+ }
+
+ for i := range items {
+ if items[i].GetStatus() != models.ItemPreProcessed {
+ logger.Debug("skipping item", "seed_id", seed.GetShortID(), "item_id", items[i].GetShortID(), "status", items[i].GetStatus().String(), "depth", items[i].GetDepth())
+ continue
+ }
+
+ guard <- struct{}{}
+
+ wg.Add(1)
+ go func(item *models.Item) {
+ defer wg.Done()
+ defer func() { <-guard }()
+ defer stats.URLsCrawledIncr()
+
+ var (
+ err error
+ resp *http.Response
+ )
+
+ // Execute the request
+ req := item.GetURL().GetRequest()
+ if req == nil {
+ panic("request is nil")
+ }
+ if config.Get().Proxy != "" {
+ resp, err = globalArchiver.ClientWithProxy.Do(req)
+ } else {
+ resp, err = globalArchiver.Client.Do(req)
+ }
+ if err != nil {
+ logger.Error("unable to execute request", "err", err.Error(), "seed_id", seed.GetShortID(), "item_id", item.GetShortID(), "depth", item.GetDepth(), "hops", item.GetURL().GetHops())
+ item.SetStatus(models.ItemFailed)
+ return
+ }
+
+ // Set the response in the URL
+ item.GetURL().SetResponse(resp)
+
+ // Process the body
+ err = ProcessBody(item.GetURL(), config.Get().DisableAssetsCapture, domainscrawl.Enabled(), config.Get().MaxHops, config.Get().WARCTempDir)
+ if err != nil {
+ logger.Error("unable to process body", "err", err.Error(), "item_id", item.GetShortID(), "seed_id", seed.GetShortID(), "depth", item.GetDepth(), "hops", item.GetURL().GetHops())
+ item.SetStatus(models.ItemFailed)
+ return
+ }
+
+ stats.HTTPReturnCodesIncr(strconv.Itoa(resp.StatusCode))
+
+ logger.Info("url archived", "url", item.GetURL().String(), "seed_id", seed.GetShortID(), "item_id", item.GetShortID(), "depth", item.GetDepth(), "hops", item.GetURL().GetHops(), "status", resp.StatusCode)
+
+ item.SetStatus(models.ItemArchived)
+ }(items[i])
+ }
+
+ // Wait for all goroutines to finish
+ wg.Wait()
+
+ return
+}
diff --git a/internal/pkg/archiver/body.go b/internal/pkg/archiver/body.go
new file mode 100644
index 00000000..34910afc
--- /dev/null
+++ b/internal/pkg/archiver/body.go
@@ -0,0 +1,125 @@
+package archiver
+
+import (
+ "bytes"
+ "io"
+ "strings"
+ "time"
+
+ "github.com/CorentinB/warc/pkg/spooledtempfile"
+ "github.com/gabriel-vasile/mimetype"
+ "github.com/internetarchive/Zeno/internal/pkg/config"
+ "github.com/internetarchive/Zeno/pkg/models"
+)
+
+// ProcessBody processes the body of a URL response, loading it into memory or a temporary file
+func ProcessBody(u *models.URL, disableAssetsCapture, domainsCrawl bool, maxHops int, WARCTempDir string) error {
+ defer u.GetResponse().Body.Close() // Ensure the response body is closed
+
+	// Retrieve the underlying connection (if it exposes SetReadDeadline) and apply the configured read deadline
+ conn, ok := u.GetResponse().Body.(interface{ SetReadDeadline(time.Time) error })
+ if ok {
+ err := conn.SetReadDeadline(time.Now().Add(time.Duration(config.Get().HTTPReadDeadline)))
+ if err != nil {
+ return err
+ }
+ }
+
+ // If we are not capturing assets, not extracting outlinks, and domains crawl is disabled
+ // we can just consume and discard the body
+ if disableAssetsCapture && !domainsCrawl && maxHops == 0 {
+ if err := copyWithTimeout(io.Discard, u.GetResponse().Body, conn); err != nil {
+ return err
+ }
+ }
+
+ // Create a buffer to hold the body (first 2KB)
+ buffer := new(bytes.Buffer)
+ if err := copyWithTimeoutN(buffer, u.GetResponse().Body, 2048, conn); err != nil {
+ return err
+ }
+
+ // Detect and set MIME type
+ u.SetMIMEType(mimetype.Detect(buffer.Bytes()))
+
+ // Check if the MIME type requires post-processing
+ if (u.GetMIMEType().Parent() != nil && u.GetMIMEType().Parent().String() == "text/plain") ||
+ strings.Contains(u.GetMIMEType().String(), "text/") {
+
+ // Create a temp file with a 2MB memory buffer
+ spooledBuff := spooledtempfile.NewSpooledTempFile("zeno", WARCTempDir, 2097152, false, -1)
+ _, err := io.Copy(spooledBuff, buffer)
+ if err != nil {
+ closeErr := spooledBuff.Close()
+ if closeErr != nil {
+ panic(closeErr)
+ }
+ return err
+ }
+
+ // Read the rest of the body into the spooled buffer
+ if err := copyWithTimeout(spooledBuff, u.GetResponse().Body, conn); err != nil {
+ closeErr := spooledBuff.Close()
+ if closeErr != nil {
+ panic(closeErr)
+ }
+ return err
+ }
+
+ u.SetBody(spooledBuff)
+ u.RewindBody()
+
+ return nil
+ } else {
+ // Read the rest of the body but discard it
+ if err := copyWithTimeout(io.Discard, u.GetResponse().Body, conn); err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
+// copyWithTimeout copies data and resets the read deadline after each successful read
+func copyWithTimeout(dst io.Writer, src io.Reader, conn interface{ SetReadDeadline(time.Time) error }) error {
+ buf := make([]byte, 4096)
+ for {
+ n, err := src.Read(buf)
+ if n > 0 {
+ // Reset the deadline after each successful read
+ if conn != nil {
+ err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.Get().HTTPReadDeadline)))
+ if err != nil {
+ return err
+ }
+ }
+ if _, writeErr := dst.Write(buf[:n]); writeErr != nil {
+ return writeErr
+ }
+ }
+ if err != nil {
+ if err == io.EOF {
+ break
+ }
+ return err
+ }
+ }
+ return nil
+}
+
+// copyWithTimeoutN copies a limited number of bytes and applies the timeout
+func copyWithTimeoutN(dst io.Writer, src io.Reader, n int64, conn interface{ SetReadDeadline(time.Time) error }) error {
+ _, err := io.CopyN(dst, src, n)
+ if err != nil && err != io.EOF {
+ return err
+ }
+
+ // Reset deadline after partial read
+ if conn != nil {
+ err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.Get().HTTPReadDeadline)))
+ if err != nil {
+ return err
+ }
+ }
+ return nil
+}
diff --git a/internal/pkg/archiver/error.go b/internal/pkg/archiver/error.go
new file mode 100644
index 00000000..d0af5b7b
--- /dev/null
+++ b/internal/pkg/archiver/error.go
@@ -0,0 +1,8 @@
+package archiver
+
+import "errors"
+
+var (
+	// ErrArchiverAlreadyInitialized is the error returned when the archiver is already initialized
+ ErrArchiverAlreadyInitialized = errors.New("archiver already initialized")
+)
diff --git a/internal/pkg/archiver/warc.go b/internal/pkg/archiver/warc.go
new file mode 100644
index 00000000..b38f5607
--- /dev/null
+++ b/internal/pkg/archiver/warc.go
@@ -0,0 +1,138 @@
+package archiver
+
+import (
+ "context"
+ "os"
+ "path"
+ "time"
+
+ "github.com/CorentinB/warc"
+ "github.com/internetarchive/Zeno/internal/pkg/config"
+ "github.com/internetarchive/Zeno/internal/pkg/log"
+ "github.com/internetarchive/Zeno/internal/pkg/stats"
+)
+
+func startWARCWriter() {
+ // Configure WARC rotator settings
+ rotatorSettings := warc.NewRotatorSettings()
+ rotatorSettings.Prefix = config.Get().WARCPrefix
+ rotatorSettings.WARCWriterPoolSize = config.Get().WARCPoolSize
+ rotatorSettings.WarcSize = float64(config.Get().WARCSize)
+ rotatorSettings.OutputDirectory = path.Join(config.Get().JobPath, "warcs")
+
+ // Configure WARC dedupe settings
+ dedupeOptions := warc.DedupeOptions{LocalDedupe: !config.Get().DisableLocalDedupe, SizeThreshold: config.Get().WARCDedupeSize}
+ if config.Get().CDXDedupeServer != "" {
+ dedupeOptions = warc.DedupeOptions{
+ LocalDedupe: !config.Get().DisableLocalDedupe,
+ CDXDedupe: true,
+ CDXURL: config.Get().CDXDedupeServer,
+ CDXCookie: config.Get().CDXCookie,
+ SizeThreshold: config.Get().WARCDedupeSize,
+ }
+ }
+
+ // Configure WARC settings
+ WARCSettings := warc.HTTPClientSettings{
+ RotatorSettings: rotatorSettings,
+ DedupeOptions: dedupeOptions,
+ DecompressBody: true,
+ SkipHTTPStatusCodes: []int{429},
+ VerifyCerts: config.Get().CertValidation,
+ TempDir: config.Get().WARCTempDir,
+ FullOnDisk: config.Get().WARCOnDisk,
+ RandomLocalIP: config.Get().RandomLocalIP,
+ DisableIPv4: config.Get().DisableIPv4,
+ DisableIPv6: config.Get().DisableIPv6,
+ IPv6AnyIP: config.Get().IPv6AnyIP,
+ }
+
+ // Instantiate WARC client
+ var err error
+ if config.Get().Proxy != "" {
+ proxiedWARCSettings := WARCSettings
+ proxiedWARCSettings.Proxy = config.Get().Proxy
+ globalArchiver.ClientWithProxy, err = warc.NewWARCWritingHTTPClient(proxiedWARCSettings)
+ if err != nil {
+ logger.Error("unable to init proxied WARC HTTP client", "err", err.Error(), "func", "archiver.startWARCWriter")
+ os.Exit(1)
+ }
+
+ go func() {
+ for err := range globalArchiver.ClientWithProxy.ErrChan {
+ logger.Error("WARC writer error", "err", err.Err.Error(), "func", err.Func)
+ }
+ }()
+ }
+
+	// Even if a proxied client has been set, we want to create a non-proxied one
+ // if DomainsBypassProxy is used. The domains specified in this slice won't go
+ // through the proxied client, but through a "normal" client
+ if config.Get().Proxy == "" || len(config.Get().DomainsBypassProxy) > 0 {
+ globalArchiver.Client, err = warc.NewWARCWritingHTTPClient(WARCSettings)
+ if err != nil {
+ logger.Error("unable to init WARC HTTP client", "err", err.Error(), "func", "archiver.startWARCWriter")
+ os.Exit(1)
+ }
+
+ go func() {
+ for err := range globalArchiver.Client.ErrChan {
+ logger.Error("WARC writer error", "err", err.Err.Error(), "func", err.Func)
+ }
+ }()
+ }
+
+ // Set the timeouts
+ if config.Get().HTTPTimeout > 0 {
+ if globalArchiver.Client != nil {
+ globalArchiver.Client.Timeout = time.Duration(config.Get().HTTPTimeout) * time.Second
+ }
+
+ if globalArchiver.ClientWithProxy != nil {
+ globalArchiver.ClientWithProxy.Timeout = time.Duration(config.Get().HTTPTimeout) * time.Second
+ }
+ }
+}
+
+func GetClients() (clients []*warc.CustomHTTPClient) {
+ for _, c := range []*warc.CustomHTTPClient{globalArchiver.Client, globalArchiver.ClientWithProxy} {
+ if c != nil {
+ clients = append(clients, c)
+ }
+ }
+
+ return clients
+}
+
+func GetWARCWritingQueueSize() (total int) {
+ for _, c := range []*warc.CustomHTTPClient{globalArchiver.Client, globalArchiver.ClientWithProxy} {
+ if c != nil {
+ total += c.WaitGroup.Size()
+ }
+ }
+
+ return total
+}
+
+var (
+ watchWARCWritingQueueContext, watchWARCWritingQueueCancel = context.WithCancel(context.Background())
+)
+
+func watchWARCWritingQueue(interval time.Duration) {
+ logger := log.NewFieldedLogger(&log.Fields{
+ "component": "archiver.warcWritingQueueWatcher",
+ })
+
+ ticker := time.NewTicker(interval)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-watchWARCWritingQueueContext.Done():
+ logger.Debug("closed")
+ return
+ case <-ticker.C:
+ stats.WarcWritingQueueSizeSet(int64(GetWARCWritingQueueSize()))
+ }
+ }
+}
diff --git a/internal/pkg/config/config.go b/internal/pkg/config/config.go
new file mode 100644
index 00000000..448b1663
--- /dev/null
+++ b/internal/pkg/config/config.go
@@ -0,0 +1,393 @@
+package config
+
+import (
+ "bufio"
+ "fmt"
+ "log/slog"
+ "net/http"
+ "net/url"
+ "os"
+ "path"
+ "path/filepath"
+ "regexp"
+ "strings"
+ "sync"
+ "time"
+
+ "github.com/google/uuid"
+ "github.com/internetarchive/Zeno/internal/pkg/postprocessor/domainscrawl"
+ "github.com/internetarchive/Zeno/internal/pkg/utils"
+ "github.com/spf13/pflag"
+ "github.com/spf13/viper"
+)
+
+// Config holds all configuration for our program, parsed from various sources
+// The `mapstructure` tags are used to map the fields to the viper configuration
+type Config struct {
+ Job string `mapstructure:"job"`
+ JobPath string
+
+ // UseSeencheck exists just for convenience of not checking
+ // !DisableSeencheck in the rest of the code, to make the code clearer
+ DisableSeencheck bool `mapstructure:"disable-seencheck"`
+ UseSeencheck bool
+
+ UserAgent string `mapstructure:"user-agent"`
+ Cookies string `mapstructure:"cookies"`
+ APIPort string `mapstructure:"api-port"`
+ PrometheusPrefix string `mapstructure:"prometheus-prefix"`
+ WARCPrefix string `mapstructure:"warc-prefix"`
+ WARCOperator string `mapstructure:"warc-operator"`
+ WARCTempDir string `mapstructure:"warc-temp-dir"`
+ WARCSize int `mapstructure:"warc-size"`
+ WARCOnDisk bool `mapstructure:"warc-on-disk"`
+ WARCPoolSize int `mapstructure:"warc-pool-size"`
+ WARCDedupeSize int `mapstructure:"warc-dedupe-size"`
+ CDXDedupeServer string `mapstructure:"warc-cdx-dedupe-server"`
+ CDXCookie string `mapstructure:"warc-cdx-cookie"`
+ HQAddress string `mapstructure:"hq-address"`
+ HQKey string `mapstructure:"hq-key"`
+ HQSecret string `mapstructure:"hq-secret"`
+ HQProject string `mapstructure:"hq-project"`
+ HQStrategy string `mapstructure:"hq-strategy"`
+ HQBatchSize int `mapstructure:"hq-batch-size"`
+ HQBatchConcurrency int `mapstructure:"hq-batch-concurrency"`
+ DisableHTMLTag []string `mapstructure:"disable-html-tag"`
+ ExcludeHosts []string `mapstructure:"exclude-host"`
+ IncludeHosts []string `mapstructure:"include-host"`
+ IncludeString []string `mapstructure:"include-string"`
+ ExcludeString []string `mapstructure:"exclude-string"`
+ ExclusionFile []string `mapstructure:"exclusion-file"`
+ WorkersCount int `mapstructure:"workers"`
+ MaxConcurrentAssets int `mapstructure:"max-concurrent-assets"`
+ MaxHops int `mapstructure:"max-hops"`
+ MaxRedirect int `mapstructure:"max-redirect"`
+ MaxRetry int `mapstructure:"max-retry"`
+ HTTPTimeout int `mapstructure:"http-timeout"`
+ HTTPReadDeadline int `mapstructure:"http-read-deadline"`
+ CrawlTimeLimit int `mapstructure:"crawl-time-limit"`
+ CrawlMaxTimeLimit int `mapstructure:"crawl-max-time-limit"`
+ MinSpaceRequired int `mapstructure:"min-space-required"`
+ KeepCookies bool `mapstructure:"keep-cookies"`
+ Headless bool `mapstructure:"headless"`
+ JSON bool `mapstructure:"json"`
+ API bool `mapstructure:"api"`
+ Prometheus bool `mapstructure:"prometheus"`
+ DomainsCrawl []string `mapstructure:"domains-crawl"`
+ CaptureAlternatePages bool `mapstructure:"capture-alternate-pages"`
+ DisableLocalDedupe bool `mapstructure:"disable-local-dedupe"`
+ CertValidation bool `mapstructure:"cert-validation"`
+ DisableAssetsCapture bool `mapstructure:"disable-assets-capture"`
+ UseHQ bool // Special field to check if HQ is enabled depending on the command called
+ HQRateLimitingSendBack bool `mapstructure:"hq-rate-limiting-send-back"`
+ NoBatchWriteWAL bool `mapstructure:"ultrasafe-queue"`
+ Handover bool `mapstructure:"handover"`
+
+ // Network
+ Proxy string `mapstructure:"proxy"`
+ DomainsBypassProxy []string `mapstructure:"bypass-proxy"`
+ RandomLocalIP bool `mapstructure:"random-local-ip"`
+ DisableIPv4 bool `mapstructure:"disable-ipv4"`
+ DisableIPv6 bool `mapstructure:"disable-ipv6"`
+ IPv6AnyIP bool `mapstructure:"ipv6-anyip"`
+
+ // Dependencies
+ NoYTDLP bool `mapstructure:"no-ytdlp"`
+ YTDLPPath string `mapstructure:"ytdlp-path"`
+
+ // Logging
+ NoStdoutLogging bool `mapstructure:"no-stdout-log"`
+ NoStderrLogging bool `mapstructure:"no-stderr-log"`
+ NoFileLogging bool `mapstructure:"no-log-file"`
+ StdoutLogLevel string `mapstructure:"log-level"`
+ TUI bool `mapstructure:"tui"`
+ TUILogLevel string `mapstructure:"tui-log-level"`
+ LogFileLevel string `mapstructure:"log-file-level"`
+ LogFileOutputDir string `mapstructure:"log-file-output-dir"`
+ LogFilePrefix string `mapstructure:"log-file-prefix"`
+ LogFileRotation string `mapstructure:"log-file-rotation"`
+ ElasticSearchURLs string `mapstructure:"log-es-urls"`
+ ElasticSearchUsername string `mapstructure:"log-es-user"`
+ ElasticSearchPassword string `mapstructure:"log-es-password"`
+ ElasticSearchLogLevel string `mapstructure:"log-es-log-level"`
+ ElasticSearchIndexPrefix string `mapstructure:"log-es-index-prefix"`
+ ElasticSearchRotation string `mapstructure:"log-es-rotation"`
+
+ // Profiling
+ PyroscopeAddress string `mapstructure:"pyroscope-address"`
+
+ InputSeeds []string // Special field to store the input URLs
+ ExclusionRegexes []*regexp.Regexp // Special field to store the compiled exclusion regex (from --exclusion-file)
+}
+
+var (
+ config *Config
+ once sync.Once
+)
+
+// InitConfig initializes the configuration
+// Flags -> Env -> Config file -> Consul config
+// Latest has precedence over the rest
+func InitConfig() error {
+ var err error
+ once.Do(func() {
+ config = &Config{}
+
+ // Check if a config file is provided via flag
+ if configFile := viper.GetString("config-file"); configFile != "" {
+ viper.SetConfigFile(configFile)
+ } else {
+ home, err := os.UserHomeDir()
+ if err != nil {
+ fmt.Println(err)
+ os.Exit(1)
+ }
+
+ viper.AddConfigPath(home)
+ viper.SetConfigType("yaml")
+ viper.SetConfigName("zeno-config")
+ }
+
+ viper.SetEnvPrefix("ZENO")
+ replacer := strings.NewReplacer("-", "_", ".", "_")
+ viper.SetEnvKeyReplacer(replacer)
+ viper.AutomaticEnv()
+
+ if err = viper.ReadInConfig(); err == nil {
+ fmt.Println("Using config file:", viper.ConfigFileUsed())
+ }
+
+ if viper.GetBool("consul-config") && viper.GetString("consul-address") != "" {
+ var consulAddress *url.URL
+ consulAddress, err = url.Parse(viper.GetString("consul-address"))
+ if err != nil {
+ return
+ }
+
+ consulPath, consulFile := filepath.Split(viper.GetString("consul-path"))
+ viper.AddRemoteProvider("consul", consulAddress.String(), consulPath)
+ viper.SetConfigType(filepath.Ext(consulFile))
+ viper.SetConfigName(strings.TrimSuffix(consulFile, filepath.Ext(consulFile)))
+
+ if err = viper.ReadInConfig(); err == nil {
+ fmt.Println("Using config file:", viper.ConfigFileUsed())
+ }
+ }
+
+ // This function is used to bring logic to the flags when needed (e.g. live-stats)
+ handleFlagsEdgeCases()
+
+ // This function is used to handle flags aliases (e.g. hops -> max-hops)
+ handleFlagsAliases()
+
+ // Unmarshal the config into the Config struct
+ err = viper.Unmarshal(config)
+ })
+ return err
+}
+
+// BindFlags binds the flags to the viper configuration
+// This is needed because viper doesn't support same flag name across multiple commands
+// Details here: https://github.com/spf13/viper/issues/375#issuecomment-794668149
+func BindFlags(flagSet *pflag.FlagSet) {
+ flagSet.VisitAll(func(flag *pflag.Flag) {
+ viper.BindPFlag(flag.Name, flag)
+ })
+}
+
+// Get returns the config struct
+func Get() *Config {
+ return config
+}
+
+func GenerateCrawlConfig() error {
+ // If the job name isn't specified, we generate a random name
+ if config.Job == "" {
+ if config.HQProject != "" {
+ config.Job = config.HQProject
+ } else {
+ UUID, err := uuid.NewUUID()
+ if err != nil {
+ slog.Error("cmd/utils.go:InitCrawlWithCMD():uuid.NewUUID()", "error", err)
+ return err
+ }
+
+ config.Job = UUID.String()
+ }
+ }
+
+ config.JobPath = path.Join("jobs", config.Job)
+ config.UseSeencheck = !config.DisableSeencheck
+
+ // Defaults --max-crawl-time-limit to 10% more than --crawl-time-limit
+ if config.CrawlMaxTimeLimit == 0 && config.CrawlTimeLimit != 0 {
+ config.CrawlMaxTimeLimit = config.CrawlTimeLimit + (config.CrawlTimeLimit / 10)
+ }
+
+ // We exclude some hosts by default
+ config.ExcludeHosts = utils.DedupeStrings(append(config.ExcludeHosts, "archive.org", "archive-it.org"))
+
+ if config.WARCTempDir == "" {
+ config.WARCTempDir = path.Join(config.JobPath, "temp")
+ }
+
+ if config.UserAgent == "" {
+ version := utils.GetVersion()
+
+ // If Version is a commit hash, we only take the first 7 characters
+ if len(version.Version) >= 40 {
+ version.Version = version.Version[:7]
+ }
+
+ config.UserAgent = "Mozilla/5.0 (compatible; archive.org_bot +http://archive.org/details/archive.org_bot) Zeno/" + version.Version + " warc/" + version.WarcVersion
+ slog.Info("User-Agent set to", "user-agent", config.UserAgent)
+ }
+
+ if config.RandomLocalIP {
+ slog.Warn("Random local IP is enabled")
+ }
+
+ if config.DisableIPv4 && config.DisableIPv6 {
+ slog.Error("Both IPv4 and IPv6 are disabled, at least one of them must be enabled.")
+ os.Exit(1)
+ } else if config.DisableIPv4 {
+ slog.Info("IPv4 is disabled")
+ } else if config.DisableIPv6 {
+ slog.Info("IPv6 is disabled")
+ }
+
+ if len(config.ExclusionFile) > 0 {
+ for _, file := range config.ExclusionFile {
+ var (
+ regexes []string
+ err error
+ )
+
+ if strings.HasPrefix(file, "http://") || strings.HasPrefix(file, "https://") {
+ slog.Info("Reading (remote) exclusion file", "file", file)
+ regexes, err = readRemoteExclusionFile(file)
+ if err != nil {
+ return err
+ }
+ } else {
+ slog.Info("Reading (local) exclusion file", "file", file)
+ regexes, err = readLocalExclusionFile(file)
+ if err != nil {
+ return err
+ }
+ }
+
+ slog.Info("Compiling exclusion regexes", "regexes", len(regexes))
+ compiledRegexes := compileRegexes(regexes)
+
+ config.ExclusionRegexes = append(config.ExclusionRegexes, compiledRegexes...)
+ }
+ }
+
+ if len(config.DomainsCrawl) > 0 {
+ slog.Info("Domains crawl enabled", "domains/regex", config.DomainsCrawl)
+ err := domainscrawl.AddElements(config.DomainsCrawl)
+ if err != nil {
+ panic(err)
+ }
+ }
+
+ return nil
+}
+
+func compileRegexes(regexes []string) []*regexp.Regexp {
+ var compiledRegexes []*regexp.Regexp
+
+ for _, regex := range regexes {
+ slog.Debug("Compiling regex", "regex", regex)
+ compiledRegex := regexp.MustCompile(regex)
+
+ compiledRegexes = append(compiledRegexes, compiledRegex)
+ }
+
+ return compiledRegexes
+}
+
+func readLocalExclusionFile(file string) (regexes []string, err error) {
+ f, err := os.Open(file)
+ if err != nil {
+ return regexes, err
+ }
+ defer f.Close()
+
+ scanner := bufio.NewScanner(f)
+ for scanner.Scan() {
+ regexes = append(regexes, scanner.Text())
+ }
+
+ if err := scanner.Err(); err != nil {
+ return regexes, err
+ }
+
+ return regexes, nil
+}
+
+func readRemoteExclusionFile(URL string) (regexes []string, err error) {
+ httpClient := &http.Client{
+ Timeout: time.Second * 5,
+ }
+
+ req, err := http.NewRequest(http.MethodGet, URL, nil)
+ if err != nil {
+ return regexes, err
+ }
+
+ req.Header.Set("User-Agent", config.UserAgent)
+
+ resp, err := httpClient.Do(req)
+ if err != nil {
+ return regexes, err
+ }
+
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return regexes, fmt.Errorf("failed to download exclusion file: %s", resp.Status)
+ }
+
+ // Read file line by line
+ scanner := bufio.NewScanner(resp.Body)
+ for scanner.Scan() {
+ regexes = append(regexes, scanner.Text())
+ }
+
+ if err := scanner.Err(); err != nil {
+ return regexes, err
+ }
+
+ return regexes, nil
+}
+
+func handleFlagsEdgeCases() {
+ if viper.GetBool("tui") {
+		// If tui is true, disable stdout and stderr logging
+ viper.Set("no-stdout-log", true)
+ viper.Set("no-stderr-log", true)
+ }
+
+ if viper.GetBool("prometheus") {
+		// If prometheus is true, enable the API as well
+ viper.Set("api", true)
+ }
+}
+
+func handleFlagsAliases() {
+ // For each flag we want to alias, we check if the original flag is at default and if the alias is not
+ // If so, we set the original flag to the value of the alias
+ if viper.GetUint("hops") != 0 && viper.GetUint("max-hops") == 0 {
+ viper.Set("max-hops", viper.GetUint("hops"))
+ }
+
+ if viper.GetInt("ca") != 1 && viper.GetInt("max-concurrent-assets") == 1 {
+ viper.Set("max-concurrent-assets", viper.GetInt("ca"))
+ }
+
+ if viper.GetInt("msr") != 20 && viper.GetInt("min-space-required") == 20 {
+ viper.Set("min-space-required", viper.GetInt("msr"))
+ }
+}
diff --git a/internal/pkg/controler/channels.go b/internal/pkg/controler/channels.go
new file mode 100644
index 00000000..7d5a6984
--- /dev/null
+++ b/internal/pkg/controler/channels.go
@@ -0,0 +1,29 @@
+package controler
+
+import "github.com/internetarchive/Zeno/pkg/models"
+
+var (
+ stageChannels []chan *models.Item
+)
+
+func makeStageChannel(bufferSize ...int) chan *models.Item {
+ var parsedSize int
+
+ if len(bufferSize) == 0 {
+ parsedSize = 0
+ } else if len(bufferSize) == 1 {
+ parsedSize = bufferSize[0]
+ } else {
+ panic("makeStageChannel: too many arguments, variadic argument should be omitted or a single integer")
+ }
+
+ ch := make(chan *models.Item, parsedSize)
+ stageChannels = append(stageChannels, ch)
+ return ch
+}
+
+func closeStageChannels() {
+ for _, ch := range stageChannels {
+ close(ch)
+ }
+}
diff --git a/internal/pkg/controler/controler.go b/internal/pkg/controler/controler.go
new file mode 100644
index 00000000..4c39f252
--- /dev/null
+++ b/internal/pkg/controler/controler.go
@@ -0,0 +1,13 @@
+// Package controler provides a way to start and stop the pipeline.
+package controler
+
+// Start initializes the pipeline.
+func Start() {
+ startPipeline()
+}
+
+// Stop stops the pipeline.
+func Stop() {
+ stopPipeline()
+ closeStageChannels()
+}
diff --git a/internal/pkg/controler/pause/pause.go b/internal/pkg/controler/pause/pause.go
new file mode 100644
index 00000000..55ac1cbc
--- /dev/null
+++ b/internal/pkg/controler/pause/pause.go
@@ -0,0 +1,106 @@
+package pause
+
+import (
+ "sync"
+ "sync/atomic"
+
+ "github.com/internetarchive/Zeno/internal/pkg/stats"
+)
+
+type ControlChans struct {
+ PauseCh chan struct{}
+ ResumeCh chan struct{}
+}
+
+type pauseManager struct {
+ subscribers sync.Map // Map of *ControlChans to struct{}
+ isPaused atomic.Bool
+ message string
+}
+
+var manager = &pauseManager{}
+
+// Subscribe returns a ControlChans struct for the subscriber to use.
+func Subscribe() *ControlChans {
+ chans := &ControlChans{
+ PauseCh: make(chan struct{}, 1), // Buffered to ensure non-blocking sends
+ ResumeCh: make(chan struct{}), // Unbuffered, will block on send
+ }
+ manager.subscribers.Store(chans, struct{}{})
+ return chans
+}
+
+// Unsubscribe removes the subscriber and closes its channels.
+func Unsubscribe(chans *ControlChans) {
+ manager.subscribers.Delete(chans)
+ // Close channels safely (deferred to avoid panic if already closed).
+ defer func() {
+ recover()
+ }()
+ close(chans.PauseCh)
+ close(chans.ResumeCh)
+}
+
+// Pause sends a pause signal to all subscribers.
+func Pause(message ...string) {
+ swap := manager.isPaused.CompareAndSwap(false, true)
+ if !swap {
+ return
+ }
+
+ if len(message) == 0 {
+ message = append(message, "Paused")
+ }
+
+ manager.message = message[0]
+
+ manager.subscribers.Range(func(key, _ interface{}) bool {
+ chans := key.(*ControlChans)
+ // Send pause signal (non-blocking since PauseCh is buffered).
+ select {
+ case chans.PauseCh <- struct{}{}:
+ // Signal sent.
+ default:
+ // PauseCh already has a signal.
+ }
+ return true
+ })
+ stats.PausedSet()
+}
+
+// Resume reads from each subscriber's ResumeCh to unblock them.
+func Resume() {
+ var wg sync.WaitGroup
+ manager.subscribers.Range(func(key, _ interface{}) bool {
+ chans := key.(*ControlChans)
+ wg.Add(1)
+ go func(chans *ControlChans) {
+ defer wg.Done()
+ // Read from ResumeCh to unblock subscriber.
+ _, ok := <-chans.ResumeCh
+ if !ok {
+ // Channel closed; subscriber may have unsubscribed.
+ return
+ }
+ }(chans)
+ return true
+ })
+ // Wait for all subscribers to send on their ResumeCh.
+ wg.Wait()
+
+ swap := manager.isPaused.CompareAndSwap(true, false)
+ if !swap {
+ return
+ }
+ manager.message = ""
+
+ stats.PausedReset()
+}
+
+func IsPaused() bool {
+ return manager.isPaused.Load()
+}
+
+func GetMessage() string {
+ return manager.message
+}
diff --git a/internal/pkg/controler/pause/pause_test.go b/internal/pkg/controler/pause/pause_test.go
new file mode 100644
index 00000000..b22d429d
--- /dev/null
+++ b/internal/pkg/controler/pause/pause_test.go
@@ -0,0 +1,429 @@
+package pause
+
+import (
+ "context"
+ "sync"
+ "sync/atomic"
+ "testing"
+ "time"
+
+ "github.com/internetarchive/Zeno/internal/pkg/stats"
+)
+
// TestBasicPauseResume checks the happy path with a single subscriber:
// it must receive the pause signal, block sending on ResumeCh, and be
// released once Resume() drains that channel.
func TestBasicPauseResume(t *testing.T) {
	stats.Init()
	manager = &pauseManager{} // reset package-level state between tests

	var wg sync.WaitGroup
	wg.Add(1)

	subscribed := make(chan struct{})
	pausedCh := make(chan struct{})
	resumedCh := make(chan struct{})

	go func() {
		defer wg.Done()
		controlChans := Subscribe()
		defer Unsubscribe(controlChans)

		subscribed <- struct{}{}

		for {
			select {
			case <-controlChans.PauseCh:
				// Signal that we have received the pause signal
				pausedCh <- struct{}{}
				// Attempt to send to ResumeCh; blocks until Resume() reads from it.
				controlChans.ResumeCh <- struct{}{}
				// Signal that we have resumed
				resumedCh <- struct{}{}
				return // Exit after resuming.
			default:
				time.Sleep(10 * time.Millisecond) // Simulate work.
			}
		}
	}()

	// Wait for the goroutine to subscribe
	<-subscribed

	// Pause the system.
	Pause()

	// Wait for the goroutine to signal that it has paused
	select {
	case <-pausedCh:
		// Paused successfully
	case <-time.After(200 * time.Millisecond):
		t.Fatal("Subscriber did not receive pause signal")
	}

	// Resume the system.
	Resume()

	// Wait for the goroutine to signal that it has resumed
	select {
	case <-resumedCh:
		// Resumed successfully
	case <-time.After(200 * time.Millisecond):
		t.Fatal("Subscriber did not resume")
	}

	wg.Wait()
}
+
// TestMultipleSubscribers verifies that a single Pause/Resume cycle
// reaches every one of N concurrent subscribers: each must observe the
// pause signal and each must be unblocked by Resume().
func TestMultipleSubscribers(t *testing.T) {
	stats.Init()
	manager = &pauseManager{} // reset package-level state between tests
	const numSubscribers = 10
	var wg sync.WaitGroup

	subscribedChans := make([]chan struct{}, numSubscribers)
	pausedChans := make([]chan struct{}, numSubscribers)
	resumedChans := make([]chan struct{}, numSubscribers)

	// Create multiple subscribers.
	for i := 0; i < numSubscribers; i++ {
		wg.Add(1)
		subscribedChans[i] = make(chan struct{})
		pausedChans[i] = make(chan struct{})
		resumedChans[i] = make(chan struct{})

		go func(idx int) {
			defer wg.Done()
			controlChans := Subscribe()
			defer Unsubscribe(controlChans)

			subscribedChans[idx] <- struct{}{}

			for {
				select {
				case <-controlChans.PauseCh:
					// Signal that we have paused
					pausedChans[idx] <- struct{}{}
					// Attempt to send to ResumeCh; blocks until Resume() reads from it.
					controlChans.ResumeCh <- struct{}{}
					// Signal that we have resumed
					resumedChans[idx] <- struct{}{}
					return // Exit after resuming.
				default:
					time.Sleep(10 * time.Millisecond) // Simulate work.
				}
			}
		}(i)
	}

	// Wait for all subscribers to subscribe
	for i := 0; i < numSubscribers; i++ {
		<-subscribedChans[i]
	}

	// Pause the system.
	Pause()

	// Wait for all subscribers to acknowledge the pause
	for i := 0; i < numSubscribers; i++ {
		select {
		case <-pausedChans[i]:
			// Subscriber paused
		case <-time.After(100 * time.Millisecond):
			t.Fatalf("Subscriber %d did not receive pause signal", i)
		}
	}

	// Resume the system.
	Resume()

	// Wait for all subscribers to acknowledge the resume
	for i := 0; i < numSubscribers; i++ {
		select {
		case <-resumedChans[i]:
			// Subscriber resumed
		case <-time.After(100 * time.Millisecond):
			t.Fatalf("Subscriber %d did not resume", i)
		}
	}

	wg.Wait()
}
+
// TestSubscriberUnsubscribeDuringPause checks that a subscriber may
// unsubscribe while the system is paused (without ever sending on its
// ResumeCh) and that a subsequent Resume() does not hang or panic.
// Note: Unsubscribe runs twice here (explicitly and via defer); the
// implementation tolerates the double channel close.
func TestSubscriberUnsubscribeDuringPause(t *testing.T) {
	stats.Init()
	manager = &pauseManager{} // reset package-level state between tests
	var wg sync.WaitGroup
	wg.Add(1)

	subscribedCh := make(chan struct{})
	pausedCh := make(chan struct{})

	go func() {
		defer wg.Done()
		controlChans := Subscribe()
		defer Unsubscribe(controlChans)

		subscribedCh <- struct{}{}

		for {
			select {
			case <-controlChans.PauseCh:
				// Signal that we have paused
				pausedCh <- struct{}{}
				// Unsubscribe during pause.
				Unsubscribe(controlChans)
				return
			default:
				time.Sleep(10 * time.Millisecond) // Simulate work.
			}
		}
	}()

	// Wait for the subscriber to subscribe
	<-subscribedCh

	// Pause the system.
	Pause()

	// Wait for the subscriber to acknowledge the pause
	select {
	case <-pausedCh:
		// Subscriber paused and unsubscribed
	case <-time.After(100 * time.Millisecond):
		t.Fatal("Subscriber did not receive pause signal")
	}

	// Resume the system.
	Resume()
	time.Sleep(100 * time.Millisecond) // Allow any processing.

	wg.Wait()
}
+
// TestConcurrentPauseResume runs numCycles back-to-back Pause/Resume
// cycles against numSubscribers long-lived subscribers and verifies by
// per-goroutine counters that every subscriber observed every cycle.
func TestConcurrentPauseResume(t *testing.T) {
	stats.Init()
	manager = &pauseManager{} // reset package-level state between tests
	const numSubscribers = 5
	const numCycles = 10

	var wg sync.WaitGroup
	wg.Add(numSubscribers)

	// Channels to signal pause and resume completions
	subscribedCh := make(chan struct{})
	pauseComplete := make(chan struct{})
	resumeComplete := make(chan struct{})

	// Channel to receive counts from goroutines
	countsCh := make(chan struct {
		pauses  int32
		resumes int32
	}, numSubscribers)

	// Create subscribers
	for i := 0; i < numSubscribers; i++ {
		go func() {
			defer wg.Done()
			controlChans := Subscribe()
			defer Unsubscribe(controlChans)

			subscribedCh <- struct{}{}

			var pauses, resumes int32

			for j := 0; j < numCycles; j++ {
				// Wait for pause signal
				<-controlChans.PauseCh
				pauses++

				// Signal that we've received the pause
				pauseComplete <- struct{}{}

				// Block until resumed
				controlChans.ResumeCh <- struct{}{}
				resumes++

				// Signal that we've resumed
				resumeComplete <- struct{}{}
			}

			// Send counts back to main goroutine
			countsCh <- struct {
				pauses  int32
				resumes int32
			}{pauses, resumes}
		}()
	}

	// Wait for all subscribers to subscribe
	for i := 0; i < numSubscribers; i++ {
		<-subscribedCh
	}

	// Perform pause and resume cycles
	for i := 0; i < numCycles; i++ {
		// Perform pause
		Pause()

		// Wait for all subscribers to acknowledge the pause
		for j := 0; j < numSubscribers; j++ {
			<-pauseComplete
		}

		// Perform resume
		Resume()

		// Wait for all subscribers to acknowledge the resume
		for j := 0; j < numSubscribers; j++ {
			<-resumeComplete
		}
	}

	// Wait for all subscribers to finish
	wg.Wait()
	close(countsCh)

	// Verify that all subscribers have processed the correct number of pauses and resumes
	for counts := range countsCh {
		if counts.pauses != numCycles {
			t.Fatalf("Subscriber expected to process %d pauses, but processed %d", numCycles, counts.pauses)
		}
		if counts.resumes != numCycles {
			t.Fatalf("Subscriber expected to process %d resumes, but processed %d", numCycles, counts.resumes)
		}
	}
}
+
// TestPauseResumeWithUnsubscribe checks a full pause/resume cycle where
// the subscriber unsubscribes only after it has resumed (rather than via
// defer while still subscribed).
func TestPauseResumeWithUnsubscribe(t *testing.T) {
	stats.Init()
	manager = &pauseManager{} // reset package-level state between tests
	var wg sync.WaitGroup
	wg.Add(1)

	subscribedCh := make(chan struct{})
	pausedCh := make(chan struct{})
	resumedCh := make(chan struct{})

	go func() {
		defer wg.Done()
		controlChans := Subscribe()
		subscribedCh <- struct{}{}
		// Unsubscribe after resuming.

		for {
			select {
			case <-controlChans.PauseCh:
				// Signal that we have paused
				pausedCh <- struct{}{}
				// Attempt to send to ResumeCh; blocks until Resume() reads from it.
				controlChans.ResumeCh <- struct{}{}
				// Signal that we have resumed
				resumedCh <- struct{}{}
				// Unsubscribe after resuming.
				Unsubscribe(controlChans)
				return
			default:
				time.Sleep(10 * time.Millisecond) // Simulate work.
			}
		}
	}()

	// Wait for the subscriber to subscribe
	<-subscribedCh

	// Pause the system.
	Pause()

	// Wait for the subscriber to acknowledge pause
	select {
	case <-pausedCh:
		// Subscriber paused
	case <-time.After(100 * time.Millisecond):
		t.Fatal("Subscriber did not receive pause signal")
	}

	// Resume the system.
	Resume()

	// Wait for the subscriber to acknowledge resume
	select {
	case <-resumedCh:
		// Subscriber resumed
	case <-time.After(100 * time.Millisecond):
		t.Fatal("Subscriber did not resume")
	}

	wg.Wait()
}
+
// TestNoSubscribers ensures Pause and Resume are safe no-ops (no panic,
// no deadlock) when nobody is subscribed.
func TestNoSubscribers(t *testing.T) {
	stats.Init()
	manager = &pauseManager{} // reset package-level state between tests
	// Call Pause() and Resume() when there are no subscribers.
	// If no panic occurs, the test passes.
	Pause()
	Resume()
}
+
// TestPauseResumeE2E is a wall-clock end-to-end test: a worker does one
// unit of work per 100ms, the system is paused for ~1s, and the test
// asserts that no work happened during the pause and that the pause
// lasted roughly 1 second.
// NOTE(review): the arithmetic below assumes exactly 10 work units per
// second of running time; this is timing-sensitive and may flake on a
// heavily loaded machine.
func TestPauseResumeE2E(t *testing.T) {
	stats.Init()
	manager = &pauseManager{} // reset package-level state between tests
	var workCounter int32     // Counts the amount of work done.
	var wg sync.WaitGroup
	wg.Add(1)

	ctx, cancel := context.WithCancel(context.Background())

	// Start the worker goroutine.
	go func() {
		controlChans := Subscribe()
		defer Unsubscribe(controlChans)
		defer wg.Done()
		for {
			select {
			case <-ctx.Done():
				return
			case <-controlChans.PauseCh:
				// Attempt to send to ResumeCh; blocks until Resume() reads from it.
				controlChans.ResumeCh <- struct{}{}
			default:
				// Simulate work.
				atomic.AddInt32(&workCounter, 1)
				time.Sleep(100 * time.Millisecond)
			}
		}
	}()

	// Allow the worker to do some work.
	time.Sleep(1 * time.Second)
	workBeforePause := atomic.LoadInt32(&workCounter)

	// Pause the system.
	Pause()
	pauseStart := time.Now()

	// Sleep for 1 second to keep the system paused.
	time.Sleep(1 * time.Second)

	// Resume the system.
	Resume()
	pauseDuration := time.Since(pauseStart)

	// Allow the worker to do more work.
	time.Sleep(1 * time.Second)
	workAfterResume := atomic.LoadInt32(&workCounter)

	// Calculate the amount of work done during the pause.
	workDuringPause := workAfterResume - workBeforePause - 10 // Expected 10 units of work after resume.

	// Check that no work was done during the pause.
	if workDuringPause != 0 {
		t.Fatalf("Expected no work during pause, but got %d units of work", workDuringPause)
	}

	// Verify that the pause duration is approximately 1 second.
	if pauseDuration < 900*time.Millisecond || pauseDuration > 1100*time.Millisecond {
		t.Fatalf("Expected pause duration around 1 second, but got %v", pauseDuration)
	}

	cancel()
	wg.Wait()
}
diff --git a/internal/pkg/controler/pipeline.go b/internal/pkg/controler/pipeline.go
new file mode 100644
index 00000000..02937798
--- /dev/null
+++ b/internal/pkg/controler/pipeline.go
@@ -0,0 +1,184 @@
+package controler
+
+import (
+ "fmt"
+ "os"
+ "time"
+
+ "github.com/google/uuid"
+ "github.com/internetarchive/Zeno/internal/pkg/archiver"
+ "github.com/internetarchive/Zeno/internal/pkg/config"
+ "github.com/internetarchive/Zeno/internal/pkg/controler/watchers"
+ "github.com/internetarchive/Zeno/internal/pkg/finisher"
+ "github.com/internetarchive/Zeno/internal/pkg/log"
+ "github.com/internetarchive/Zeno/internal/pkg/postprocessor"
+ "github.com/internetarchive/Zeno/internal/pkg/preprocessor"
+ "github.com/internetarchive/Zeno/internal/pkg/preprocessor/seencheck"
+ "github.com/internetarchive/Zeno/internal/pkg/reactor"
+ "github.com/internetarchive/Zeno/internal/pkg/source/hq"
+ "github.com/internetarchive/Zeno/internal/pkg/stats"
+ "github.com/internetarchive/Zeno/pkg/models"
+)
+
// startPipeline wires up and starts every stage of the crawl pipeline in
// order: job directory, disk check, logger, stats, disk watcher, reactor,
// optional seencheck, preprocessor, archiver, WARC queue watcher,
// postprocessor, finisher, and either the HQ source or a stand-in channel
// consumer. Finally it feeds the configured input seeds into the reactor.
// Any failure during startup is fatal (os.Exit or panic).
func startPipeline() {
	if err := os.MkdirAll(config.Get().JobPath, 0755); err != nil {
		fmt.Printf("can't create job directory: %s\n", err)
		os.Exit(1)
	}

	// Refuse to start at all if disk space is already below the threshold.
	if err := watchers.CheckDiskUsage(config.Get().JobPath); err != nil {
		fmt.Printf("can't start Zeno: %s\n", err)
		os.Exit(1)
	}

	err := log.Start()
	if err != nil {
		fmt.Println("error starting logger", "err", err.Error())
		panic(err)
	}

	logger := log.NewFieldedLogger(&log.Fields{
		"component": "controler.StartPipeline",
	})

	err = stats.Init()
	if err != nil {
		logger.Error("error initializing stats", "err", err.Error())
		panic(err)
	}

	// Start the disk watcher
	go watchers.WatchDiskSpace(config.Get().JobPath, 5*time.Second)

	// Start the reactor that will receive
	// Each stage is connected to the next by a channel sized to the worker count.
	reactorOutputChan := makeStageChannel(config.Get().WorkersCount)
	err = reactor.Start(config.Get().WorkersCount, reactorOutputChan)
	if err != nil {
		logger.Error("error starting reactor", "err", err.Error())
		panic(err)
	}

	// If needed, create the seencheck DB (only if not using HQ)
	if config.Get().UseSeencheck && !config.Get().UseHQ {
		err := seencheck.Start(config.Get().JobPath)
		if err != nil {
			logger.Error("unable to start seencheck", "err", err.Error())
			panic(err)
		}
	}

	preprocessorOutputChan := makeStageChannel(config.Get().WorkersCount)
	err = preprocessor.Start(reactorOutputChan, preprocessorOutputChan)
	if err != nil {
		logger.Error("error starting preprocessor", "err", err.Error())
		panic(err)
	}

	archiverOutputChan := makeStageChannel(config.Get().WorkersCount)
	err = archiver.Start(preprocessorOutputChan, archiverOutputChan)
	if err != nil {
		logger.Error("error starting archiver", "err", err.Error())
		panic(err)
	}

	// Start the WARC writing queue watcher
	go watchers.WatchWARCWritingQueue(5 * time.Second)

	postprocessorOutputChan := makeStageChannel(config.Get().WorkersCount)
	err = postprocessor.Start(archiverOutputChan, postprocessorOutputChan)
	if err != nil {
		logger.Error("error starting postprocessor", "err", err.Error())
		panic(err)
	}

	finisherFinishChan := makeStageChannel(config.Get().WorkersCount)
	finisherProduceChan := makeStageChannel(config.Get().WorkersCount)

	if config.Get().UseHQ {
		logger.Info("starting hq")
		err = hq.Start(finisherFinishChan, finisherProduceChan)
		if err != nil {
			logger.Error("error starting hq source, retrying", "err", err.Error())
			panic(err)
		}
	} else {
		// Means we're using the to-be-implemented local queue, for the moment we're just gonna consume the channels
		go func() {
			for {
				select {
				case _, ok := <-finisherFinishChan:
					if !ok {
						return
					}
				case _, ok := <-finisherProduceChan:
					if !ok {
						return
					}
				}
			}
		}()
	}

	err = finisher.Start(postprocessorOutputChan, finisherFinishChan, finisherProduceChan)
	if err != nil {
		logger.Error("error starting finisher", "err", err.Error())
		panic(err)
	}

	// Pipe in the reactor the input seeds if any
	if len(config.Get().InputSeeds) > 0 {
		for _, seed := range config.Get().InputSeeds {
			parsedURL := &models.URL{Raw: seed}
			err := parsedURL.Parse()
			if err != nil {
				panic(err)
			}

			item := models.NewItem(uuid.New().String(), parsedURL, "")
			item.SetSource(models.ItemSourceQueue)

			err = reactor.ReceiveInsert(item)
			if err != nil {
				logger.Error("unable to insert seed", "err", err.Error())
				panic(err)
			}
		}
	}
}
+
+func stopPipeline() {
+ logger := log.NewFieldedLogger(&log.Fields{
+ "component": "controler.stopPipeline",
+ })
+
+ watchers.StopDiskWatcher()
+ watchers.StopWARCWritingQueueWatcher()
+
+ reactor.Freeze()
+
+ preprocessor.Stop()
+ archiver.Stop()
+ postprocessor.Stop()
+ finisher.Stop()
+
+ if config.Get().UseSeencheck && !config.Get().UseHQ {
+ seencheck.Close()
+ }
+
+ if config.Get().UseHQ {
+ hq.Stop()
+ }
+
+ reactor.Stop()
+
+ if config.Get().WARCTempDir != "" {
+ err := os.Remove(config.Get().WARCTempDir)
+ if err != nil {
+ logger.Error("unable to remove temp dir", "err", err.Error())
+ }
+ }
+
+ logger.Info("done, logs are flushing and will be closed")
+
+ log.Stop()
+}
diff --git a/internal/pkg/controler/signal.go b/internal/pkg/controler/signal.go
new file mode 100644
index 00000000..ee59feb3
--- /dev/null
+++ b/internal/pkg/controler/signal.go
@@ -0,0 +1,38 @@
+package controler
+
+import (
+ "context"
+ "os"
+ "os/signal"
+ "syscall"
+
+ "github.com/internetarchive/Zeno/internal/pkg/log"
+)
+
+var signalWatcherCtx, signalWatcherCancel = context.WithCancel(context.Background())
+
+// WatchSignals listens for OS signals and handles them gracefully
+func WatchSignals() {
+ logger := log.NewFieldedLogger(&log.Fields{
+ "component": "controler.signalWatcher",
+ })
+ // Handle OS signals for graceful shutdown
+ signalChan := make(chan os.Signal, 1)
+ signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM)
+
+ select {
+ case <-signalWatcherCtx.Done():
+ return
+ case <-signalChan:
+ logger.Info("received shutdown signal, stopping services...")
+ // Catch a second signal to force exit
+ go func() {
+ <-signalChan
+ logger.Info("received second shutdown signal, forcing exit...")
+ os.Exit(1)
+ }()
+
+ Stop()
+ os.Exit(0)
+ }
+}
diff --git a/internal/pkg/controler/watchers/disk.go b/internal/pkg/controler/watchers/disk.go
new file mode 100644
index 00000000..a1d16c20
--- /dev/null
+++ b/internal/pkg/controler/watchers/disk.go
@@ -0,0 +1,98 @@
+package watchers
+
+import (
+ "context"
+ "fmt"
+ "sync"
+ "syscall"
+ "time"
+
+ "github.com/internetarchive/Zeno/internal/pkg/controler/pause"
+ "github.com/internetarchive/Zeno/internal/pkg/log"
+)
+
+var (
+ diskWatcherCtx, diskWatcherCancel = context.WithCancel(context.Background())
+ diskWatcherWg sync.WaitGroup
+)
+
// checkThreshold returns an error when free disk space is below the
// minimum threshold, defined as:
//
//	threshold = 50GB * (total / 256GB)  when total <= 256GB
//	threshold = 50GB                    otherwise
func checkThreshold(total, free uint64) error {
	const (
		gib           = 1024 * 1024 * 1024
		referenceSize = 256 * gib
		maxThreshold  = 50 * gib
	)

	// Flat 50 GiB for large disks, proportionally scaled down for
	// disks at or under the 256 GiB reference size.
	threshold := float64(maxThreshold)
	if total <= referenceSize {
		threshold *= float64(total) / float64(referenceSize)
	}

	if free >= uint64(threshold) {
		return nil
	}
	return fmt.Errorf("low disk space: free=%.2f GB, threshold=%.2f GB", float64(free)/1e9, threshold/1e9)
}
+
+func CheckDiskUsage(path string) error {
+ var stat syscall.Statfs_t
+ if err := syscall.Statfs(path, &stat); err != nil {
+ panic(fmt.Sprintf("Error retrieving disk stats: %v\n", err))
+ }
+
+ total := stat.Blocks * uint64(stat.Bsize)
+ free := stat.Bavail * uint64(stat.Bsize)
+
+ return checkThreshold(total, free)
+}
+
+// WatchDiskSpace watches the disk space and pauses the pipeline if it's low
+func WatchDiskSpace(path string, interval time.Duration) {
+ diskWatcherWg.Add(1)
+ defer diskWatcherWg.Done()
+
+ logger := log.NewFieldedLogger(&log.Fields{
+ "component": "controler.diskWatcher",
+ })
+
+ paused := false
+ returnASAP := false
+ ticker := time.NewTicker(interval)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-diskWatcherCtx.Done():
+ defer logger.Debug("closed")
+ if paused {
+ logger.Info("returning after resume")
+ returnASAP = true
+ }
+ return
+ case <-ticker.C:
+ err := CheckDiskUsage(path)
+
+ if err != nil && !paused {
+ logger.Warn("Low disk space, pausing the pipeline", "err", err.Error())
+ pause.Pause("Not enough disk space!!!")
+ paused = true
+ } else if err == nil && paused {
+ logger.Info("Disk space is sufficient, resuming the pipeline")
+ pause.Resume()
+ paused = false
+ if returnASAP {
+ return
+ }
+ }
+ }
+ }
+}
+
// StopDiskWatcher stops the disk watcher by canceling the context and
// waiting for the goroutine to finish (blocks until WatchDiskSpace
// returns).
func StopDiskWatcher() {
	diskWatcherCancel()
	diskWatcherWg.Wait()
}
diff --git a/internal/pkg/controler/watchers/disk_test.go b/internal/pkg/controler/watchers/disk_test.go
new file mode 100644
index 00000000..3f8bdcdc
--- /dev/null
+++ b/internal/pkg/controler/watchers/disk_test.go
@@ -0,0 +1,54 @@
+package watchers
+
+import (
+ "testing"
+)
+
+func TestCheckThreshold(t *testing.T) {
+ tests := []struct {
+ name string
+ total uint64
+ free uint64
+ wantError bool
+ }{
+ {
+ name: "Low disk space on large disk",
+ total: 300 * 1024 * 1024 * 1024, // 300 GiB
+ free: 15 * 1024 * 1024 * 1024, // 15 GiB
+ wantError: true,
+ },
+ {
+ name: "Sufficient disk space on large disk",
+ total: 300 * 1024 * 1024 * 1024, // 300 GiB
+ free: 50 * 1024 * 1024 * 1024, // 50 GiB
+ wantError: false,
+ },
+ {
+ name: "Low disk space on small disk",
+ total: 100 * 1024 * 1024 * 1024, // 100 GiB
+ free: 3 * 1024 * 1024 * 1024, // 3 GiB
+ wantError: true,
+ },
+ {
+ name: "Sufficient disk space on small disk",
+ total: 100 * 1024 * 1024 * 1024, // 100 GiB
+ free: 60 * 1024 * 1024 * 1024, // 10 GiB
+ wantError: false,
+ },
+ {
+ name: "Edge case: exactly at threshold for small disk",
+ total: 300 * 1024 * 1024 * 1024, // 200 GiB
+ free: uint64((50 * 1024 * 1024 * 1024) * (float64(300*1024*1024*1024) / float64(256*1024*1024*1024))), // Threshold value
+ wantError: false,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ err := checkThreshold(tt.total, tt.free)
+ if (err != nil) != tt.wantError {
+ t.Errorf("checkThreshold() error = %v, wantError %v", err, tt.wantError)
+ }
+ })
+ }
+}
diff --git a/internal/pkg/controler/watchers/warc.go b/internal/pkg/controler/watchers/warc.go
new file mode 100644
index 00000000..b41396b9
--- /dev/null
+++ b/internal/pkg/controler/watchers/warc.go
@@ -0,0 +1,67 @@
+package watchers
+
+import (
+ "context"
+ "sync"
+ "time"
+
+ "github.com/internetarchive/Zeno/internal/pkg/archiver"
+ "github.com/internetarchive/Zeno/internal/pkg/config"
+ "github.com/internetarchive/Zeno/internal/pkg/controler/pause"
+ "github.com/internetarchive/Zeno/internal/pkg/log"
+)
+
+var (
+ wwqCtx, wwqCancel = context.WithCancel(context.Background())
+ wwqWg sync.WaitGroup
+)
+
+// WatchWARCWritingQueue watches the WARC writing queue size and pauses the pipeline if it exceeds the worker count
+func WatchWARCWritingQueue(interval time.Duration) {
+ wwqWg.Add(1)
+ defer wwqWg.Done()
+
+ logger := log.NewFieldedLogger(&log.Fields{
+ "component": "controler.warcWritingQueueWatcher",
+ })
+
+ paused := false
+ returnASAP := false
+ ticker := time.NewTicker(interval)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-wwqCtx.Done():
+ defer logger.Debug("closed")
+ if paused {
+ logger.Info("returning after resume")
+ returnASAP = true
+ }
+ return
+ case <-ticker.C:
+ queueSize := archiver.GetWARCWritingQueueSize()
+
+ logger.Debug("checking queue size", "queue_size", queueSize, "max_queue_size", config.Get().WorkersCount, "paused", paused)
+
+ if queueSize > config.Get().WorkersCount && !paused {
+ logger.Warn("WARC writing queue exceeded the worker count, pausing the pipeline")
+ pause.Pause("WARC writing queue exceeded the worker count")
+ paused = true
+ } else if queueSize < config.Get().WorkersCount && paused {
+ logger.Info("WARC writing queue size returned to acceptable, resuming the pipeline")
+ pause.Resume()
+ paused = false
+ if returnASAP {
+ return
+ }
+ }
+ }
+ }
+}
+
// StopWARCWritingQueueWatcher stops the WARC writing queue watcher by
// canceling the context and waiting for the goroutine to finish (blocks
// until WatchWARCWritingQueue returns).
func StopWARCWritingQueueWatcher() {
	wwqCancel()
	wwqWg.Wait()
}
diff --git a/internal/pkg/crawl/dependencies/ytdlp/model.go b/internal/pkg/crawl/dependencies/ytdlp/model.go
deleted file mode 100644
index c0e2c503..00000000
--- a/internal/pkg/crawl/dependencies/ytdlp/model.go
+++ /dev/null
@@ -1,114 +0,0 @@
-package ytdlp
-
-type Subtitle struct {
- Ext string `json:"ext"`
- URL string `json:"url"`
- Name string `json:"name"`
-}
-
-type Video struct {
- ID string `json:"id"`
- Title string `json:"title"`
- Channel string `json:"channel"`
- ChannelID string `json:"channel_id"`
- ChannelURL string `json:"channel_url"`
- Description string `json:"description"`
- Timestamp int `json:"timestamp"`
- Duration float64 `json:"duration"`
- ViewCount float64 `json:"view_count"`
- Tags []string `json:"tags"`
- Categories []string `json:"categories"`
- Thumbnail string `json:"thumbnail"`
- Language string `json:"language"`
- IsLive bool `json:"is_live"`
- Subtitles map[string][]Subtitle `json:"subtitles"`
- RequestedFormats []struct {
- Acodec string `json:"acodec"`
- AspectRatio float64 `json:"aspect_ratio"`
- Asr interface{} `json:"asr"`
- AudioChannels interface{} `json:"audio_channels"`
- AudioExt string `json:"audio_ext"`
- Container string `json:"container"`
- DynamicRange string `json:"dynamic_range"`
- Ext string `json:"ext"`
- Filesize float64 `json:"filesize"`
- Format string `json:"format"`
- FormatID string `json:"format_id"`
- FormatNote string `json:"format_note"`
- Fps float64 `json:"fps"`
- Fragments []struct {
- URL string `json:"url"`
- } `json:"fragments"`
- HasDrm bool `json:"has_drm"`
- Height float64 `json:"height"`
- HTTPHeaders map[string]string `json:"http_headers"`
- Language interface{} `json:"language"`
- LanguagePreference float64 `json:"language_preference"`
- Preference interface{} `json:"preference"`
- Protocol string `json:"protocol"`
- Quality float64 `json:"quality"`
- Resolution string `json:"resolution"`
- SourcePreference float64 `json:"source_preference"`
- Tbr float64 `json:"tbr"`
- URL string `json:"url"`
- Vbr float64 `json:"vbr,omitempty"`
- Vcodec string `json:"vcodec"`
- VideoExt string `json:"video_ext"`
- Width float64 `json:"width"`
- Abr float64 `json:"abr,omitempty"`
- } `json:"requested_formats"`
- Formats []struct {
- Acodec string `json:"acodec"`
- AspectRatio float64 `json:"aspect_ratio"`
- AudioExt string `json:"audio_ext"`
- Columns float64 `json:"columns,omitempty"`
- Ext string `json:"ext"`
- Format string `json:"format"`
- FormatID string `json:"format_id"`
- FormatNote string `json:"format_note"`
- Fps float64 `json:"fps"`
- Fragments []struct {
- Duration float64 `json:"duration"`
- URL string `json:"url"`
- } `json:"fragments,omitempty"`
- Height float64 `json:"height"`
- HTTPHeaders struct {
- Accept string `json:"Accept"`
- AcceptLanguage string `json:"Accept-Language"`
- SecFetchMode string `json:"Sec-Fetch-Mode"`
- UserAgent string `json:"User-Agent"`
- } `json:"http_headers"`
- Protocol string `json:"protocol"`
- Resolution string `json:"resolution"`
- Rows float64 `json:"rows,omitempty"`
- URL string `json:"url"`
- Vcodec string `json:"vcodec"`
- VideoExt string `json:"video_ext"`
- Width float64 `json:"width"`
- Abr float64 `json:"abr,omitempty"`
- Asr float64 `json:"asr,omitempty"`
- AudioChannels float64 `json:"audio_channels,omitempty"`
- Container string `json:"container,omitempty"`
- DynamicRange interface{} `json:"dynamic_range,omitempty"`
- Filesize float64 `json:"filesize,omitempty"`
- HasDrm bool `json:"has_drm,omitempty"`
- Language string `json:"language,omitempty"`
- LanguagePreference float64 `json:"language_preference,omitempty"`
- Preference interface{} `json:"preference,omitempty"`
- Quality float64 `json:"quality,omitempty"`
- SourcePreference float64 `json:"source_preference,omitempty"`
- Tbr float64 `json:"tbr,omitempty"`
- Vbr float64 `json:"vbr,omitempty"`
- FilesizeApprox float64 `json:"filesize_approx,omitempty"`
- } `json:"formats"`
- Thumbnails []struct {
- URL string `json:"url"`
- } `json:"thumbnails"`
-}
-
-type HTTPHeaders struct {
- Accept string `json:"Accept"`
- AcceptLanguage string `json:"Accept-Language"`
- SecFetchMode string `json:"Sec-Fetch-Mode"`
- UserAgent string `json:"User-Agent"`
-}
diff --git a/internal/pkg/crawl/dependencies/ytdlp/parse.go b/internal/pkg/crawl/dependencies/ytdlp/parse.go
deleted file mode 100644
index 1c905a66..00000000
--- a/internal/pkg/crawl/dependencies/ytdlp/parse.go
+++ /dev/null
@@ -1,42 +0,0 @@
-package ytdlp
-
-import (
- "io"
- "net/url"
-)
-
-func Parse(body io.ReadCloser) (streamURLs, metaURLs []*url.URL, rawJSON string, HTTPHeaders map[string]string, err error) {
- // Create a temporary server to serve the body and call ytdlp on it
- port, stopChan, err := serveBody(body)
- if err != nil {
- return streamURLs, metaURLs, rawJSON, HTTPHeaders, err
- }
- defer close(stopChan)
-
- // Call ytdlp on the temporary server
- rawStreamURLs, rawMetaURLs, rawJSON, HTTPHeaders, err := getJSON(port)
- if err != nil {
- return streamURLs, metaURLs, rawJSON, HTTPHeaders, err
- }
-
- // Range over rawStreamURLs and rawMetaURLs to parse them as url.URL in videoURLs and metaURLs
- for _, urlString := range rawStreamURLs {
- URL, err := url.Parse(urlString)
- if err != nil {
- return streamURLs, metaURLs, rawJSON, HTTPHeaders, err
- }
-
- streamURLs = append(streamURLs, URL)
- }
-
- for _, urlString := range rawMetaURLs {
- URL, err := url.Parse(urlString)
- if err != nil {
- return streamURLs, metaURLs, rawJSON, HTTPHeaders, err
- }
-
- metaURLs = append(metaURLs, URL)
- }
-
- return streamURLs, metaURLs, rawJSON, HTTPHeaders, nil
-}
diff --git a/internal/pkg/crawl/dependencies/ytdlp/server.go b/internal/pkg/crawl/dependencies/ytdlp/server.go
deleted file mode 100644
index 4d0e34c4..00000000
--- a/internal/pkg/crawl/dependencies/ytdlp/server.go
+++ /dev/null
@@ -1,46 +0,0 @@
-package ytdlp
-
-import (
- "io"
- "net"
- "net/http"
- "strings"
-)
-
-func serveBody(body io.ReadCloser) (port int, stopChan chan struct{}, err error) {
- stopChan = make(chan struct{})
- portChan := make(chan int)
-
- bodyBytes, err := io.ReadAll(body)
- if err != nil {
- return 0, nil, err
- }
-
- // Start the server
- go func() {
- // Serve the body on the random port
- listener, err := net.Listen("tcp", "127.0.0.1:0")
- if err != nil {
- panic(err)
- }
- defer listener.Close()
-
- portChan <- listener.Addr().(*net.TCPAddr).Port
-
- go func() {
- <-stopChan
- listener.Close()
- }()
-
- // Create a handler that will serve the body on /
- handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- w.Write(bodyBytes)
- })
-
- if err := http.Serve(listener, handler); err != nil && !strings.Contains(err.Error(), "use of closed network connection") {
- return
- }
- }()
-
- return <-portChan, stopChan, nil
-}
diff --git a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go
deleted file mode 100644
index 1d628247..00000000
--- a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go
+++ /dev/null
@@ -1,95 +0,0 @@
-package ytdlp
-
-import (
- "bytes"
- "encoding/json"
- "fmt"
- "os/exec"
- "strconv"
-)
-
-func getJSON(port int) (streamURLs, metaURLs []string, rawJSON string, HTTPHeaders map[string]string, err error) {
- HTTPHeaders = make(map[string]string)
-
- // Prepare the command
- cmd := exec.Command("yt-dlp", "http://localhost:"+strconv.Itoa(port), "--dump-json", "-f", "bv[protocol=https]+ba[protocol=https]")
-
- // Buffers to capture stdout and stderr
- var stdout, stderr bytes.Buffer
- cmd.Stdout = &stdout
- cmd.Stderr = &stderr
-
- // Run the command
- err = cmd.Run()
- if err != nil {
- return streamURLs, metaURLs, rawJSON, HTTPHeaders, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String())
- }
-
- output := stdout.String()
-
- // Parse the output as a Video object
- var video Video
- err = json.Unmarshal([]byte(output), &video)
- if err != nil {
- return streamURLs, metaURLs, rawJSON, HTTPHeaders, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err)
- }
-
- // Get the manifest URL for the best video & audio quality
- // Note: we do not archive live streams
- if !video.IsLive {
- if len(video.RequestedFormats) > 0 {
- HTTPHeaders = video.RequestedFormats[0].HTTPHeaders
- for _, format := range video.RequestedFormats {
- // Choose stream_type=
- // If acodec == "none" and vcodec != "none", it's "video"
- // If acodec != "none" and vcodec == "none", it's "audio"
- // If acodec != "none" and vcodec != "none", we don't specify stream_type
- var streamType string
- if format.Acodec == "none" && format.Vcodec != "none" {
- streamType = "video"
- } else if format.Acodec != "none" && format.Vcodec == "none" {
- streamType = "audio"
- }
-
- var URL = format.URL + "&video_id=" + video.ID
- if streamType != "" {
- URL += "&stream_type=" + streamType
- }
-
- streamURLs = append(streamURLs, URL)
- }
- }
- }
-
- // Get all subtitles (not automatic captions)
- for _, subtitle := range video.Subtitles {
- for _, sub := range subtitle {
- metaURLs = append(metaURLs, sub.URL)
- }
- }
-
- // Get all thumbnail URLs
- for _, thumbnail := range video.Thumbnails {
- metaURLs = append(metaURLs, thumbnail.URL)
- }
-
- // Get the storyboards
- for _, format := range video.Formats {
- if format.FormatNote == "storyboard" {
- metaURLs = append(metaURLs, format.URL)
- for _, fragment := range format.Fragments {
- metaURLs = append(metaURLs, fragment.URL)
- }
- }
- }
-
- return streamURLs, metaURLs, output, HTTPHeaders, nil
-}
-
-func FindPath() (string, bool) {
- path, err := exec.LookPath("yt-dlp")
- if err != nil {
- return "", false
- }
- return path, true
-}
diff --git a/internal/pkg/crawl/hq.go b/internal/pkg/crawl/hq.go
deleted file mode 100644
index 8a9b9649..00000000
--- a/internal/pkg/crawl/hq.go
+++ /dev/null
@@ -1,387 +0,0 @@
-package crawl
-
-import (
- "math"
- "net/url"
- "strings"
- "sync"
- "time"
-
- "github.com/internetarchive/Zeno/internal/pkg/queue"
- "github.com/internetarchive/Zeno/internal/pkg/utils"
- "github.com/internetarchive/gocrawlhq"
-)
-
-// This function connects to HQ's websocket and listen for messages.
-// It also sends and "identify" message to the HQ to let it know that
-// Zeno is connected. This "identify" message is sent every second and
-// contains the crawler's stats and details.
-func (c *Crawl) HQWebsocket() {
- var (
- // the "identify" message will be sent every second
- // to the crawl HQ
- identifyTicker = time.NewTicker(time.Second)
- )
-
- defer func() {
- identifyTicker.Stop()
- }()
-
- // send an "identify" message to the crawl HQ every second
- for {
- err := c.HQClient.Identify(&gocrawlhq.IdentifyMessage{
- Project: c.HQProject,
- Job: c.Job,
- IP: utils.GetOutboundIP().String(),
- Hostname: utils.GetHostname(),
- GoVersion: utils.GetVersion().GoVersion,
- })
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending identify payload to crawl HQ, trying to reconnect..")
-
- err = c.HQClient.InitWebsocketConn()
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error initializing websocket connection to crawl HQ")
- }
- }
-
- <-identifyTicker.C
- }
-}
-
-func (c *Crawl) HQProducer() {
- defer c.HQChannelsWg.Done()
-
- var (
- discoveredArray = []gocrawlhq.URL{}
- mutex = sync.Mutex{}
- terminateProducer = make(chan bool)
- )
-
- // the discoveredArray is sent to the crawl HQ every 10 seconds
- // or when it reaches a certain size
- go func() {
- HQLastSent := time.Now()
-
- for {
- select {
- case <-terminateProducer:
- // no need to lock the mutex here, because the producer channel
- // is already closed, so no other goroutine can write to the slice
- if len(discoveredArray) > 0 {
- for {
- err := c.HQClient.Add(discoveredArray, false)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..")
- time.Sleep(time.Second)
- continue
- }
- break
- }
- }
-
- return
- default:
- mutex.Lock()
- if (len(discoveredArray) >= int(math.Ceil(float64(c.Workers.Count)/2)) || time.Since(HQLastSent) >= time.Second*10) && len(discoveredArray) > 0 {
- for {
- err := c.HQClient.Add(discoveredArray, false)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..")
- time.Sleep(time.Second)
- continue
- }
- break
- }
-
- discoveredArray = []gocrawlhq.URL{}
- HQLastSent = time.Now()
- }
- mutex.Unlock()
- }
- }
- }()
-
- // listen to the discovered channel and add the URLs to the discoveredArray
- for discoveredItem := range c.HQProducerChannel {
- var via string
-
- if discoveredItem.ParentURL != nil {
- via = utils.URLToString(discoveredItem.ParentURL)
- }
-
- discoveredURL := gocrawlhq.URL{
- Value: utils.URLToString(discoveredItem.URL),
- Via: via,
- }
-
- for i := uint64(0); i < discoveredItem.Hop; i++ {
- discoveredURL.Path += "L"
- }
-
- // The reason we are using a string instead of a bool is because
- // gob's encode/decode doesn't properly support booleans
- if discoveredItem.BypassSeencheck {
- for {
- err := c.HQClient.Add([]gocrawlhq.URL{discoveredURL}, true)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
- "bypassSeencheck": discoveredItem.BypassSeencheck,
- })).Error("error sending payload to crawl HQ, waiting 1s then retrying..")
- time.Sleep(time.Second)
- continue
- }
- break
- }
- continue
- }
-
- mutex.Lock()
- discoveredArray = append(discoveredArray, discoveredURL)
- mutex.Unlock()
- }
-
- // if we are here, it means that the HQProducerChannel has been closed
- // so we need to send the last payload to the crawl HQ
- terminateProducer <- true
-}
-
-func (c *Crawl) HQConsumer() {
- for {
- c.HQConsumerState = "running"
-
- // This is on purpose evaluated every time,
- // because the value of workers will maybe change
- // during the crawl in the future (to be implemented)
- var HQBatchSize = int(c.Workers.Count)
-
- if c.Finished.Get() {
- c.HQConsumerState = "finished"
- c.Log.Error("crawl finished, stopping HQ consumer")
- break
- }
-
- // If HQContinuousPull is set to true, we will pull URLs from HQ continuously,
- // otherwise we will only pull URLs when needed (and when the crawl is not paused)
- for (c.Queue.GetStats().TotalElements > HQBatchSize && !c.HQContinuousPull) || c.Paused.Get() || c.Queue.HandoverOpen.Get() {
- c.HQConsumerState = "waiting"
- c.Log.Info("HQ producer waiting", "paused", c.Paused.Get(), "handoverOpen", c.Queue.HandoverOpen.Get(), "queueSize", c.Queue.GetStats().TotalElements)
- time.Sleep(time.Millisecond * 50)
- continue
- }
-
- // If a specific HQ batch size is set, use it
- if c.HQBatchSize != 0 {
- HQBatchSize = c.HQBatchSize
- }
-
- // get batch from crawl HQ
- c.HQConsumerState = "waitingOnFeed"
- var URLs []gocrawlhq.URL
- var err error
- if c.HQBatchConcurrency == 1 {
- URLs, err = c.HQClient.Get(HQBatchSize, c.HQStrategy)
- if err != nil {
- // c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
- // "batchSize": HQBatchSize,
- // "err": err,
- // })).Debug("error getting new URLs from crawl HQ")
- continue
- }
- } else {
- var mu sync.Mutex
- var wg sync.WaitGroup
- batchSize := HQBatchSize / c.HQBatchConcurrency
- URLsChan := make(chan []gocrawlhq.URL, c.HQBatchConcurrency)
-
- // Start goroutines to get URLs from crawl HQ, each will request
- // HQBatchSize / HQConcurrentBatch URLs
- for i := 0; i < c.HQBatchConcurrency; i++ {
- wg.Add(1)
- go func() {
- defer wg.Done()
- URLs, err := c.HQClient.Get(batchSize, c.HQStrategy)
- if err != nil {
- // c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
- // "batchSize": batchSize,
- // "err": err,
- // })).Debug("error getting new URLs from crawl HQ")
- return
- }
- URLsChan <- URLs
- }()
- }
-
- // Wait for all goroutines to finish
- go func() {
- wg.Wait()
- close(URLsChan)
- }()
-
- // Collect all URLs from the channels
- for URLsFromChan := range URLsChan {
- mu.Lock()
- URLs = append(URLs, URLsFromChan...)
- mu.Unlock()
- }
- }
- c.HQConsumerState = "feedCompleted"
-
- // send all URLs received in the batch to the queue
- var items = make([]*queue.Item, 0, len(URLs))
- if len(URLs) > 0 {
- for _, URL := range URLs {
- c.HQConsumerState = "urlParse"
- newURL, err := url.Parse(URL.Value)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
- "url": URL.Value,
- "batchSize": HQBatchSize,
- "err": err,
- })).Error("unable to parse URL received from crawl HQ, discarding")
- continue
- }
-
- c.HQConsumerState = "newItem"
- newItem, err := queue.NewItem(newURL, nil, "seed", uint64(strings.Count(URL.Path, "L")), URL.ID, false)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, newURL, map[string]interface{}{
- "url": URL.Value,
- "batchSize": HQBatchSize,
- "err": err,
- })).Error("unable to create new item from URL received from crawl HQ, discarding")
- continue
- }
-
- c.HQConsumerState = "append"
- items = append(items, newItem)
- }
- }
-
- c.HQConsumerState = "enqueue"
- err = c.Queue.BatchEnqueue(items...)
- if err != nil {
- c.Log.Error("unable to enqueue URL batch received from crawl HQ, discarding", "error", err)
- continue
- }
- }
-}
-
-func (c *Crawl) HQFinisher() {
- defer c.HQChannelsWg.Done()
-
- var (
- finishedArray = []gocrawlhq.URL{}
- locallyCrawledTotal int
- )
-
- for finishedItem := range c.HQFinishedChannel {
- if finishedItem.ID == "" {
- c.Log.WithFields(c.genLogFields(nil, finishedItem.URL, nil)).Warn("URL has no ID, discarding")
- continue
- }
-
- locallyCrawledTotal += int(finishedItem.LocallyCrawled)
- finishedArray = append(finishedArray, gocrawlhq.URL{ID: finishedItem.ID, Value: utils.URLToString(finishedItem.URL)})
-
- if len(finishedArray) == int(math.Ceil(float64(c.Workers.Count)/2)) {
- for {
- err := c.HQClient.Delete(finishedArray, locallyCrawledTotal)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
- "finishedArray": finishedArray,
- })).Error("error submitting finished urls to crawl HQ. retrying in one second...")
- time.Sleep(time.Second)
- continue
- }
- break
- }
-
- finishedArray = []gocrawlhq.URL{}
- locallyCrawledTotal = 0
- }
- }
-
- // send remaining finished URLs
- if len(finishedArray) > 0 {
- for {
- err := c.HQClient.Delete(finishedArray, locallyCrawledTotal)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
- "finishedArray": finishedArray,
- })).Error("error submitting finished urls to crawl HQ. retrying in one second...")
- time.Sleep(time.Second)
- continue
- }
- break
- }
- }
-}
-
-func (c *Crawl) HQSeencheckURLs(URLs []*url.URL) (seencheckedBatch []*url.URL, err error) {
- var (
- discoveredURLs []gocrawlhq.URL
- )
-
- for _, URL := range URLs {
- discoveredURLs = append(discoveredURLs, gocrawlhq.URL{
- Value: utils.URLToString(URL),
- Type: "asset",
- })
- }
-
- outputURLs, err := c.HQClient.Seencheck(discoveredURLs)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
- "batchLen": len(URLs),
- "urls": discoveredURLs,
- })).Error("error sending seencheck payload to crawl HQ")
- return seencheckedBatch, err
- }
-
- if outputURLs != nil {
- for _, URL := range outputURLs {
- // the returned payload only contain new URLs to be crawled by Zeno
- newURL, err := url.Parse(URL.Value)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, URL, map[string]interface{}{
- "batchLen": len(URLs),
- })).Error("error parsing URL from HQ seencheck response")
- return seencheckedBatch, err
- }
-
- seencheckedBatch = append(seencheckedBatch, newURL)
- }
- }
-
- return seencheckedBatch, nil
-}
-
-// returns:
-// - bool: true if the URL is new, false if it has been seen before
-// - error: if there's an error sending the payload to crawl HQ
-//
-// NOTE: if there's an error, the URL is considered new
-func (c *Crawl) HQSeencheckURL(URL *url.URL) (bool, error) {
- discoveredURL := gocrawlhq.URL{
- Value: utils.URLToString(URL),
- Type: "asset",
- }
-
- outputURLs, err := c.HQClient.Seencheck([]gocrawlhq.URL{discoveredURL})
- if err != nil {
- c.Log.Error("error sending seencheck payload to crawl HQ", "err", err, "url", utils.URLToString(URL))
- return true, err // return true, don't discard the URL if there's an error
- }
-
- if outputURLs != nil {
- for _, URL := range outputURLs {
- // the returned payload only contain new URLs to be crawled by Zeno
- if URL.Value == discoveredURL.Value {
- return true, nil
- }
- }
- }
-
- return false, nil
-}
diff --git a/internal/pkg/crawl/link_header_test.go b/internal/pkg/crawl/link_header_test.go
deleted file mode 100644
index 2d165721..00000000
--- a/internal/pkg/crawl/link_header_test.go
+++ /dev/null
@@ -1,100 +0,0 @@
-package crawl
-
-import (
- "slices"
- "testing"
-)
-
-func TestParseOneLink(t *testing.T) {
- var links []Link
- links = append(links, Link{URL: "https://one.example.com", Rel: "preconnect"})
-
- var link = `; rel="preconnect"`
-
- got := Parse(link)
- want := links
-
- if !slices.Equal(got, want) {
- t.Fatalf("got %q, wanted %q", got, want)
- }
-}
-
-func TestParseMultipleLinks(t *testing.T) {
- var links []Link
- links = append(links,
- Link{URL: "https://test.com", Rel: "preconnect"},
- Link{URL: "https://app.test.com", Rel: "preconnect"},
- Link{URL: "https://example.com", Rel: "preconnect"},
- )
-
- var link = `; rel="preconnect", ; rel="preconnect"; foo="bar", ; rel="preconnect"`
-
- got := Parse(link)
- want := links
-
- if !slices.Equal(got, want) {
- t.Fatalf("got %q, wanted %q", got, want)
- }
-}
-
-func TestParseOneMalformedLink(t *testing.T) {
- var links []Link
- links = append(links, Link{URL: "https://one.example.com", Rel: "preconnect"})
-
- var link = `https://one.example.com>;; rel=preconnect";`
-
- got := Parse(link)
- want := links
-
- if !slices.Equal(got, want) {
- t.Fatalf("got %q, wanted %q", got, want)
- }
-}
-
-func TestParseMultipleMalformedLinks(t *testing.T) {
- var links []Link
- links = append(links,
- Link{URL: "", Rel: "preconnect"},
- Link{URL: "https://app.test.com", Rel: ""},
- Link{URL: "", Rel: ""},
- )
-
- var link = `; rel="preconnect", https://app.test.com; rel=""; "bar", <>; ="preconnect"`
-
- got := Parse(link)
- want := links
-
- if !slices.Equal(got, want) {
- t.Fatalf("got %q, wanted %q", got, want)
- }
-}
-
-func TestParseAttr(t *testing.T) {
- attr := `rel="preconnect"`
-
- gotKey, gotValue := ParseAttr(attr)
- wantKey, wantValue := "rel", "preconnect"
-
- if gotKey != wantKey {
- t.Fatalf("got %q, wanted %q", gotKey, wantKey)
- }
-
- if gotValue != wantValue {
- t.Fatalf("got %q, wanted %q", gotValue, wantValue)
- }
-}
-
-func TestParseMalformedAttr(t *testing.T) {
- attr := `="preconnect"`
-
- gotKey, gotValue := ParseAttr(attr)
- wantKey, wantValue := "", "preconnect"
-
- if gotKey != wantKey {
- t.Fatalf("got %q, wanted %q", gotKey, wantKey)
- }
-
- if gotValue != wantValue {
- t.Fatalf("got %q, wanted %q", gotValue, wantValue)
- }
-}
diff --git a/internal/pkg/crawl/sitespecific/cloudflarestream/cloudflarestream.go b/internal/pkg/crawl/sitespecific/cloudflarestream/cloudflarestream.go
deleted file mode 100644
index 8951b9ee..00000000
--- a/internal/pkg/crawl/sitespecific/cloudflarestream/cloudflarestream.go
+++ /dev/null
@@ -1,454 +0,0 @@
-package cloudflarestream
-
-import (
- "encoding/xml"
- "errors"
- "io"
- "io/ioutil"
- "math"
- "net/url"
- "strconv"
- "strings"
-
- "github.com/CorentinB/warc"
- "github.com/PuerkitoBio/goquery"
- "github.com/internetarchive/Zeno/internal/pkg/utils"
-)
-
-type MPD struct {
- XMLName xml.Name `xml:"MPD"`
- Text string `xml:",chardata"`
- Xmlns string `xml:"xmlns,attr"`
- Profiles string `xml:"profiles,attr"`
- Type string `xml:"type,attr"`
- MediaPresentationDuration string `xml:"mediaPresentationDuration,attr"`
- MinBufferTime string `xml:"minBufferTime,attr"`
- Period struct {
- Text string `xml:",chardata"`
- ID string `xml:"id,attr"`
- AdaptationSet []struct {
- Text string `xml:",chardata"`
- ID string `xml:"id,attr"`
- MimeType string `xml:"mimeType,attr"`
- SegmentAlignment string `xml:"segmentAlignment,attr"`
- Lang string `xml:"lang,attr"`
- Representation []struct {
- Text string `xml:",chardata"`
- ID string `xml:"id,attr"`
- AudioSamplingRate string `xml:"audioSamplingRate,attr"`
- Bandwidth string `xml:"bandwidth,attr"`
- Codecs string `xml:"codecs,attr"`
- FrameRate string `xml:"frameRate,attr"`
- Height string `xml:"height,attr"`
- Width string `xml:"width,attr"`
- AudioChannelConfiguration struct {
- Text string `xml:",chardata"`
- SchemeIdUri string `xml:"schemeIdUri,attr"`
- Value string `xml:"value,attr"`
- } `xml:"AudioChannelConfiguration"`
- SegmentTemplate struct {
- Text string `xml:",chardata"`
- Duration string `xml:"duration,attr"`
- Initialization string `xml:"initialization,attr"`
- Media string `xml:"media,attr"`
- StartNumber string `xml:"startNumber,attr"`
- Timescale string `xml:"timescale,attr"`
- } `xml:"SegmentTemplate"`
- } `xml:"Representation"`
- } `xml:"AdaptationSet"`
- } `xml:"Period"`
-}
-
-func IsURL(URL string) bool {
- return strings.Contains(URL, "cloudflarestream.com")
-}
-
-func GetJSFiles(doc *goquery.Document, watchPageURL *url.URL, httpClient warc.CustomHTTPClient) (archivedURLs []string, err error) {
- var latestJSURL string
-
-	// Look for the <script> tags in the watch page to find the JS files (original comment garbled by page-scrape residue — reconstruct from upstream history before applying)