diff --git a/.gitignore b/.gitignore index 98455fe3..28459922 100644 --- a/.gitignore +++ b/.gitignore @@ -2,8 +2,12 @@ jobs/* jobs/ Zeno *.txt -*.sh +*.sh* zeno.log .vscode/ *.py -.DS_Store \ No newline at end of file +.DS_Store +ZENO-* +output.log +.old/ +*.warc.* \ No newline at end of file diff --git a/.old/go.mod b/.old/go.mod new file mode 100644 index 00000000..1200c6fa --- /dev/null +++ b/.old/go.mod @@ -0,0 +1,95 @@ +module github.com/internetarchive/Zeno + +go 1.22.4 + +require ( + github.com/CorentinB/warc v0.8.53 + github.com/PuerkitoBio/goquery v1.9.3 + github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 + github.com/clbanning/mxj/v2 v2.7.0 + github.com/dustin/go-humanize v1.0.1 + github.com/elastic/go-elasticsearch/v8 v8.15.0 + github.com/google/uuid v1.6.0 + github.com/gosuri/uilive v0.0.4 + github.com/gosuri/uitable v0.0.4 + github.com/grafov/m3u8 v0.12.0 + github.com/internetarchive/gocrawlhq v1.2.20 + github.com/paulbellamy/ratecounter v0.2.0 + github.com/philippgille/gokv/leveldb v0.7.0 + github.com/prometheus/client_golang v1.20.4 + github.com/remeh/sizedwaitgroup v1.0.0 + github.com/sirupsen/logrus v1.9.3 + github.com/spf13/cobra v1.8.1 + github.com/spf13/pflag v1.0.5 + github.com/spf13/viper v1.19.0 + github.com/telanflow/cookiejar v0.0.0-20190719062046-114449e86aa5 + go.uber.org/goleak v1.3.0 + golang.org/x/net v0.29.0 + google.golang.org/protobuf v1.34.2 + mvdan.cc/xurls/v2 v2.5.0 +) + +require ( + github.com/andybalholm/brotli v1.1.0 // indirect + github.com/andybalholm/cascadia v1.3.2 // indirect + github.com/aws/aws-sdk-go v1.55.5 // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/cloudflare/circl v1.4.0 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/elastic/elastic-transport-go/v8 v8.6.0 // indirect + github.com/fatih/color v1.17.0 // indirect + github.com/fsnotify/fsnotify v1.7.0 // indirect + 
github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/gobwas/httphead v0.1.0 // indirect + github.com/gobwas/pool v0.2.1 // indirect + github.com/gobwas/ws v1.4.0 // indirect + github.com/golang/snappy v0.0.4 // indirect + github.com/gomodule/redigo v1.9.2 // indirect + github.com/google/go-cmp v0.6.0 // indirect + github.com/hashicorp/hcl v1.0.0 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/jmespath/go-jmespath v0.4.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/klauspost/compress v1.17.10 // indirect + github.com/magiconair/properties v1.8.7 // indirect + github.com/mattn/go-colorable v0.1.13 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/mattn/go-runewidth v0.0.16 // indirect + github.com/miekg/dns v1.1.62 // indirect + github.com/mitchellh/mapstructure v1.5.0 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/onsi/gomega v1.34.2 // indirect + github.com/pelletier/go-toml/v2 v2.2.3 // indirect + github.com/philippgille/gokv/encoding v0.7.0 // indirect + github.com/philippgille/gokv/util v0.7.0 // indirect + github.com/prometheus/client_model v0.6.1 // indirect + github.com/prometheus/common v0.59.1 // indirect + github.com/prometheus/procfs v0.15.1 // indirect + github.com/refraction-networking/utls v1.6.7 // indirect + github.com/rivo/uniseg v0.4.7 // indirect + github.com/sagikazarmark/locafero v0.6.0 // indirect + github.com/sagikazarmark/slog-shim v0.1.0 // indirect + github.com/sourcegraph/conc v0.3.0 // indirect + github.com/spf13/afero v1.11.0 // indirect + github.com/spf13/cast v1.7.0 // indirect + github.com/subosito/gotenv v1.6.0 // indirect + github.com/syndtr/goleveldb v1.0.0 // indirect + github.com/ulikunitz/xz v0.5.12 // 
indirect + go.opentelemetry.io/otel v1.30.0 // indirect + go.opentelemetry.io/otel/metric v1.30.0 // indirect + go.opentelemetry.io/otel/trace v1.30.0 // indirect + go.uber.org/multierr v1.11.0 // indirect + golang.org/x/crypto v0.27.0 // indirect + golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 // indirect + golang.org/x/mod v0.21.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.26.0 // indirect + golang.org/x/text v0.18.0 // indirect + golang.org/x/tools v0.25.0 // indirect + gopkg.in/ini.v1 v1.67.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/.old/go.sum b/.old/go.sum new file mode 100644 index 00000000..a4f66821 --- /dev/null +++ b/.old/go.sum @@ -0,0 +1,291 @@ +git.archive.org/wb/gocrawlhq v1.2.13 h1:PqEhgtYqNEUWO2JEJUHmXT+nIwW9LRgb4ocUFANciQo= +git.archive.org/wb/gocrawlhq v1.2.13/go.mod h1:JQIKgebFmpbxmEalNRjID3RwCxHkslt3PHAnum82KtM= +github.com/CorentinB/warc v0.8.52 h1:k6lkq3uh6PkhZG+WKpPEkeQPmO1byb7MnSZaNT28SH4= +github.com/CorentinB/warc v0.8.52/go.mod h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8= +github.com/CorentinB/warc v0.8.53 h1:xVz3RMdZ6faAqTtLfcK1/yl8ZTansy+B2en//EZLUlM= +github.com/CorentinB/warc v0.8.53/go.mod h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8= +github.com/PuerkitoBio/goquery v1.9.3 h1:mpJr/ikUA9/GNJB/DBZcGeFDXUtosHRyRrwh7KGdTG0= +github.com/PuerkitoBio/goquery v1.9.3/go.mod h1:1ndLHPdTz+DyQPICCWYlYQMPl0oXZj0G6D4LCYA6u4U= +github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4= +github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4= +github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= +github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= +github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= +github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= 
+github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= +github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so= +github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw= +github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU= +github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/clbanning/mxj/v2 v2.7.0 h1:WA/La7UGCanFe5NpHF0Q3DNtnCsVoxbPKuyBNHWRyME= +github.com/clbanning/mxj/v2 v2.7.0/go.mod h1:hNiWqW14h+kc+MdF9C6/YoRfjEJoR3ou6tn/Qo+ve2s= +github.com/cloudflare/circl v1.4.0 h1:BV7h5MgrktNzytKmWjpOtdYrf0lkkbF8YMlBGPhJQrY= +github.com/cloudflare/circl v1.4.0/go.mod h1:PDRU+oXvdD7KCtgKxW95M5Z8BpSCJXQORiZFnBQS5QU= +github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dustin/go-humanize v1.0.1/go.mod 
h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= +github.com/elastic/elastic-transport-go/v8 v8.6.0 h1:Y2S/FBjx1LlCv5m6pWAF2kDJAHoSjSRSJCApolgfthA= +github.com/elastic/elastic-transport-go/v8 v8.6.0/go.mod h1:YLHer5cj0csTzNFXoNQ8qhtGY1GTvSqPnKWKaqQE3Hk= +github.com/elastic/go-elasticsearch/v8 v8.15.0 h1:IZyJhe7t7WI3NEFdcHnf6IJXqpRf+8S8QWLtZYYyBYk= +github.com/elastic/go-elasticsearch/v8 v8.15.0/go.mod h1:HCON3zj4btpqs2N1jjsAy4a/fiAul+YBP00mBH4xik8= +github.com/fatih/color v1.17.0 h1:GlRw1BRJxkpqUCBKzKOw098ed57fEsKeNjpTe3cSjK4= +github.com/fatih/color v1.17.0/go.mod h1:YZ7TlrGPkiz6ku9fK3TLD/pl3CpsiFyu8N92HLgmosI= +github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= +github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= +github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= +github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= +github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-test/deep v1.1.0 h1:WOcxcdHcvdgThNXjw0t76K42FXTU7HpNQWHpA2HHNlg= +github.com/go-test/deep v1.1.0/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE= +github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU= +github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= +github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= +github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= 
+github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs= +github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/gomodule/redigo v1.9.2 h1:HrutZBLhSIU8abiSfW8pj8mPhOyMYjZT/wcA4/L9L9s= +github.com/gomodule/redigo v1.9.2/go.mod h1:KsU3hiK/Ay8U42qpaJk+kuNa3C+spxapWpM+ywhcgtw= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gosuri/uilive v0.0.4 h1:hUEBpQDj8D8jXgtCdBu7sWsy5sbW/5GhuO8KBwJ2jyY= +github.com/gosuri/uilive v0.0.4/go.mod h1:V/epo5LjjlDE5RJUcqx8dbw+zc93y5Ya3yg8tfZ74VI= +github.com/gosuri/uitable v0.0.4 h1:IG2xLKRvErL3uhY6e1BylFzG+aJiwQviDDTfOKeKTpY= +github.com/gosuri/uitable v0.0.4/go.mod h1:tKR86bXuXPZazfOTG1FIzvjIdXzd0mo4Vtn16vt0PJo= +github.com/grafov/m3u8 v0.12.0 h1:T6iTwTsSEtMcwkayef+FJO8kj+Sglr4Lh81Zj8Ked/4= +github.com/grafov/m3u8 v0.12.0/go.mod h1:nqzOkfBiZJENr52zTVd/Dcl03yzphIMbJqkXGu+u080= +github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= +github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= +github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= +github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= 
+github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/internetarchive/gocrawlhq v1.2.13 h1:ALfUrWR7nRez5gWhHRJ7ZklIpGMjERGMUJqR4HBl4+8= +github.com/internetarchive/gocrawlhq v1.2.13/go.mod h1:JQIKgebFmpbxmEalNRjID3RwCxHkslt3PHAnum82KtM= +github.com/internetarchive/gocrawlhq v1.2.14 h1:g3MPMonpA6mTkCpjBvW3paeBHiH+gGgwSvkyX/lxu7s= +github.com/internetarchive/gocrawlhq v1.2.14/go.mod h1:IOHVfWsptADzh+r2J+UnSm22EB9r8TiVVeAuP9WRFoc= +github.com/internetarchive/gocrawlhq v1.2.15 h1:Llv6tvxxRUxoC9G4GsjkpbfKX0anbQUU+pwFiROlxzg= +github.com/internetarchive/gocrawlhq v1.2.15/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs= +github.com/internetarchive/gocrawlhq v1.2.16 h1:D9JJdLL8uqpHUDU3SxxcXUjQETbxnk08e9xo929xrlE= +github.com/internetarchive/gocrawlhq v1.2.16/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs= +github.com/internetarchive/gocrawlhq v1.2.17 h1:nSjFHpDp5C9Q8SrDPibC4Iiih6kpw18+2GnifJiVpO0= +github.com/internetarchive/gocrawlhq v1.2.17/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs= +github.com/internetarchive/gocrawlhq v1.2.18 h1:PPe7UqJ2NNOljn70SmUhoKdgPreeqRUk9XVrYShCn4w= +github.com/internetarchive/gocrawlhq v1.2.18/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs= +github.com/internetarchive/gocrawlhq v1.2.19 h1:bvDliaeWjt97x64bOf+rKXStQX7VE+ZON/I1FS3sQ6A= +github.com/internetarchive/gocrawlhq v1.2.19/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek= +github.com/internetarchive/gocrawlhq v1.2.20 h1:0mIIt9lhPacKr6L2JeISoopQ8EgzC3dISJ3ITGGbOp4= +github.com/internetarchive/gocrawlhq v1.2.20/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek= +github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= +github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= +github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod 
h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/klauspost/compress v1.17.10 h1:oXAz+Vh0PMUvJczoi+flxpnBEPxoER1IaAnU/NMPtT0= +github.com/klauspost/compress v1.17.10/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= +github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= +github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc= +github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= +github.com/miekg/dns v1.1.62 h1:cN8OuEF1/x5Rq6Np+h1epln8OiyPWV+lROx9LxcGgIQ= +github.com/miekg/dns v1.1.62/go.mod h1:mvDlcItzm+br7MToIKqkglaGhlFMHJ9DTNNWONWXbNQ= +github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= 
+github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/ginkgo v1.7.0 h1:WSHQ+IS43OoUrWtD1/bbclrwK8TTH5hzp+umCiuxHgs= +github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= +github.com/onsi/gomega v1.34.2 h1:pNCwDkzrsv7MS9kpaQvVb1aVLahQXyJ/Tv5oAZMI3i8= +github.com/onsi/gomega v1.34.2/go.mod h1:v1xfxRgk0KIsG+QOdm7p8UosrOzPYRo60fd3B/1Dukc= +github.com/paulbellamy/ratecounter v0.2.0 h1:2L/RhJq+HA8gBQImDXtLPrDXK5qAj6ozWVK/zFXVJGs= +github.com/paulbellamy/ratecounter v0.2.0/go.mod h1:Hfx1hDpSGoqxkVVpBi/IlYD7kChlfo5C6hzIHwPqfFE= +github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M= +github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc= +github.com/philippgille/gokv v0.7.0 h1:rQSIQspete82h78Br7k7rKUZ8JYy/hWlwzm/W5qobPI= +github.com/philippgille/gokv v0.7.0/go.mod h1:OwiTP/3bhEBhSuOmFmq1+rszglfSgjJVxd1HOgOa2N4= +github.com/philippgille/gokv/encoding v0.7.0 
h1:2oxepKzzTsi00iLZBCZ7Rmqrallh9zws3iqSrLGfkgo= +github.com/philippgille/gokv/encoding v0.7.0/go.mod h1:yncOBBUciyniPI8t5ECF8XSCwhONE9Rjf3My5IHs3fA= +github.com/philippgille/gokv/leveldb v0.7.0 h1:QTH83utBG8knTTFzO1tIF1amKIjz9xxOPLaZrU48kdQ= +github.com/philippgille/gokv/leveldb v0.7.0/go.mod h1:EE/vyZ5VwPPWwZHKJYWd/rkqUIJXFykKA5eluazFByc= +github.com/philippgille/gokv/test v0.7.0 h1:0wBKnKaFZlSeHxLXcmUJqK//IQGUMeu+o8B876KCiOM= +github.com/philippgille/gokv/test v0.7.0/go.mod h1:TP/VzO/qAoi6njsfKnRpXKno0hRuzD5wsLnHhtUcVkY= +github.com/philippgille/gokv/util v0.7.0 h1:5avUK/a3aSj/aWjhHv4/FkqgMon2B7k2BqFgLcR+DYg= +github.com/philippgille/gokv/util v0.7.0/go.mod h1:i9KLHbPxGiHLMhkix/CcDQhpPbCkJy5BkW+RKgwDHMo= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.20.4 h1:Tgh3Yr67PaOv/uTqloMsCEdeuFTatm5zIq5+qNN23vI= +github.com/prometheus/client_golang v1.20.4/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.59.1 h1:LXb1quJHWm1P6wq/U824uxYi4Sg0oGvNeUm1z5dJoX0= +github.com/prometheus/common v0.59.1/go.mod h1:GpWM7dewqmVYcd7SmRaiWVe9SSqjf0UrwnYnpEZNuT0= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/refraction-networking/utls v1.6.7 h1:zVJ7sP1dJx/WtVuITug3qYUq034cDq9B2MR1K67ULZM= +github.com/refraction-networking/utls v1.6.7/go.mod h1:BC3O4vQzye5hqpmDTWUqi4P5DDhzJfkV1tdqtawQIH0= 
+github.com/remeh/sizedwaitgroup v1.0.0 h1:VNGGFwNo/R5+MJBf6yrsr110p0m4/OX4S3DCy7Kyl5E= +github.com/remeh/sizedwaitgroup v1.0.0/go.mod h1:3j2R4OIe/SeS6YDhICBy22RWjJC5eNCJ1V+9+NVNYlo= +github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= +github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/sagikazarmark/locafero v0.6.0 h1:ON7AQg37yzcRPU69mt7gwhFEBwxI6P9T4Qu3N51bwOk= +github.com/sagikazarmark/locafero v0.6.0/go.mod h1:77OmuIc6VTraTXKXIs/uvUxKGUXjE1GbemJYHqdNjX0= +github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE= +github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= +github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0= +github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8= +github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY= +github.com/spf13/cast v1.7.0 h1:ntdiHjuueXFgm5nzDRdOS4yfT43P5Fnud6DH50rz/7w= +github.com/spf13/cast v1.7.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= +github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= +github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= +github.com/spf13/pflag v1.0.5 
h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/viper v1.19.0 h1:RWq5SEjt8o25SROyN3z2OrDB9l7RPd3lwTWU8EcEdcI= +github.com/spf13/viper v1.19.0/go.mod h1:GQUN9bilAbhU/jgc1bKs99f/suXKeUMct8Adx5+Ntkg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= +github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= +github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE= +github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= +github.com/telanflow/cookiejar v0.0.0-20190719062046-114449e86aa5 h1:gTQl5nPlc9B53vFOKM8aJHwxB2BW2kM49PVR5526GBg= +github.com/telanflow/cookiejar v0.0.0-20190719062046-114449e86aa5/go.mod h1:qNgA5MKwTh103SxGTooqZMiKxZTaV9UV3KjN7I7Drig= +github.com/ulikunitz/xz v0.5.12 h1:37Nm15o69RwBkXM0J6A5OlE67RZTfzUxTj8fB3dfcsc= +github.com/ulikunitz/xz v0.5.12/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.opentelemetry.io/otel v1.30.0 h1:F2t8sK4qf1fAmY9ua4ohFS/K+FUuOPemHUIXHtktrts= +go.opentelemetry.io/otel v1.30.0/go.mod h1:tFw4Br9b7fOS+uEao81PJjVMjW/5fvNCbpsDIXqP0pc= +go.opentelemetry.io/otel/metric v1.30.0 h1:4xNulvn9gjzo4hjg+wzIKG7iNFEaBMX00Qd4QIZs7+w= +go.opentelemetry.io/otel/metric v1.30.0/go.mod h1:aXTfST94tswhWEb+5QjlSqG+cZlmyXy/u8jFpor3WqQ= +go.opentelemetry.io/otel/sdk v1.21.0 
h1:FTt8qirL1EysG6sTQRZ5TokkU8d0ugCj8htOgThZXQ8= +go.opentelemetry.io/otel/sdk v1.21.0/go.mod h1:Nna6Yv7PWTdgJHVRD9hIYywQBRx7pbox6nwBnZIxl/E= +go.opentelemetry.io/otel/trace v1.30.0 h1:7UBkkYzeg3C7kQX8VAidWh2biiQbtAKjyIML8dQ9wmc= +go.opentelemetry.io/otel/trace v1.30.0/go.mod h1:5EyKqTzzmyqB9bwtCCq6pDLktPK6fmGf/Dph+8VI02o= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A= +golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70= +golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk= +golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0= +golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= +golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod 
h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= +golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo= +golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= +golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= +golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.25.0 h1:oFU9pkj/iJgs+0DT+VMHrx+oBKs/LJMV+Uvg78sl+fE= +golang.org/x/tools v0.25.0/go.mod h1:/vtpO8WL1N9cQC3FN5zPqb//fRXskFHbLKk4OW1Q7rg= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/protobuf v1.34.2 
h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= +google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4= +gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= +gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= +gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= +gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +mvdan.cc/xurls/v2 v2.5.0 h1:lyBNOm8Wo71UknhUs4QTFUNNMyxy2JEIaKKo0RWOh+8= +mvdan.cc/xurls/v2 v2.5.0/go.mod h1:yQgaGQ1rFtJUzkmKiHYSSfuQxqfYmd//X6PxvholpeE= diff --git a/internal/pkg/crawl/api.go b/.old/internal/pkg/crawl/api.go similarity index 100% rename from internal/pkg/crawl/api.go rename to .old/internal/pkg/crawl/api.go diff --git a/internal/pkg/crawl/assets.go b/.old/internal/pkg/crawl/assets.go similarity index 66% rename from internal/pkg/crawl/assets.go rename to .old/internal/pkg/crawl/assets.go index 9aaa90eb..754602f4 100644 --- a/internal/pkg/crawl/assets.go +++ 
b/.old/internal/pkg/crawl/assets.go @@ -1,183 +1,178 @@ package crawl import ( - "io" - "net/http" "net/url" - "regexp" "strconv" "strings" - "sync/atomic" "github.com/PuerkitoBio/goquery" "github.com/internetarchive/Zeno/internal/pkg/crawl/extractor" "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream" "github.com/internetarchive/Zeno/internal/pkg/queue" "github.com/internetarchive/Zeno/internal/pkg/utils" - "github.com/remeh/sizedwaitgroup" ) -var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`) -var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`) - -func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie, headers map[string]string) error { - var resp *http.Response - - // Prepare GET request - req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil) - if err != nil { - return err - } - - req.Header.Set("Referer", utils.URLToString(item.ParentURL)) - req.Header.Set("User-Agent", c.UserAgent) - - // If headers are passed, apply them to the request - if headers != nil { - for key, value := range headers { - req.Header.Set(key, value) - } - } - - // Apply cookies obtained from the original URL captured - for i := range cookies { - req.AddCookie(cookies[i]) - } - - resp, err = c.executeGET(item, req, false) - if err != nil && err.Error() == "URL from redirection has already been seen" { - return nil - } else if err != nil { - return err - } - defer resp.Body.Close() - - if extractor.IsM3U8(resp) { - assets, err := extractor.M3U8(resp) - if err == nil { - assets = c.seencheckAssets(assets, item) - if len(assets) != 0 { - c.captureAssets(item, assets, cookies, headers) - } - } else { - c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8") - } - } - - io.Copy(io.Discard, resp.Body) - - return nil -} - -func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie, headers map[string]string) { - // TODO: implement a 
counter for the number of assets - // currently being processed - // c.Frontier.QueueCount.Incr(int64(len(assets))) - swg := sizedwaitgroup.New(int(c.MaxConcurrentAssets)) - excluded := false - - for _, asset := range assets { - // TODO: implement a counter for the number of assets - // currently being processed - // c.Frontier.QueueCount.Incr(-1) - - // Just making sure we do not over archive by archiving the original URL - if utils.URLToString(item.URL) == utils.URLToString(asset) { - continue - } - - // If the URL match any excluded string, we ignore it - for _, excludedString := range c.ExcludedStrings { - if strings.Contains(utils.URLToString(asset), excludedString) { - excluded = true - break - } - } - - if excluded { - excluded = false - continue - } - - swg.Add() - c.URIsPerSecond.Incr(1) - - go func(asset *url.URL, swg *sizedwaitgroup.SizedWaitGroup) { - defer swg.Done() - - // Create the asset's item - newAsset, err := queue.NewItem(asset, item.URL, "asset", item.Hop, "", false) - if err != nil { - c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{ - "parentHop": item.Hop, - "parentUrl": utils.URLToString(item.URL), - "type": "asset", - })).Error("error while creating asset item") - return - } - - // Capture the asset - err = c.captureAsset(newAsset, cookies, headers) - if err != nil { - c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{ - "parentHop": item.Hop, - "parentUrl": utils.URLToString(item.URL), - "type": "asset", - })).Error("error while capturing asset") - return - } - - // If we made it to this point, it means that the asset have been crawled successfully, - // then we can increment the locallyCrawled variable - atomic.AddUint64(&item.LocallyCrawled, 1) - }(asset, &swg) - } - - swg.Wait() -} - -func (c *Crawl) seencheckAssets(assets []*url.URL, item *queue.Item) []*url.URL { - if c.UseSeencheck { - if c.UseHQ { - seencheckedURLs, err := c.HQSeencheckURLs(assets) - // We ignore the error here because we don't 
want to slow down the crawl - // if HQ is down or if the request failed. So if we get an error, we just - // continue with the original list of assets. - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ - "urls": assets, - "parentHop": item.Hop, - "parentUrl": utils.URLToString(item.URL), - })).Error("error while seenchecking assets via HQ") - } else { - assets = seencheckedURLs - } - - if len(assets) == 0 { - return []*url.URL{} - } - } else { - seencheckedBatch := []*url.URL{} - - for _, URL := range assets { - found := c.Seencheck.SeencheckURL(utils.URLToString(URL), "asset") - if found { - continue - } - - seencheckedBatch = append(seencheckedBatch, URL) - } - - if len(seencheckedBatch) == 0 { - return []*url.URL{} - } - - assets = seencheckedBatch - } - } - - return assets -} +// var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`) +// var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`) + +// func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie, headers map[string]string) error { +// var resp *http.Response + +// // Prepare GET request +// req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil) +// if err != nil { +// return err +// } + +// req.Header.Set("Referer", utils.URLToString(item.ParentURL)) +// req.Header.Set("User-Agent", c.UserAgent) + +// // If headers are passed, apply them to the request +// if headers != nil { +// for key, value := range headers { +// req.Header.Set(key, value) +// } +// } + +// // Apply cookies obtained from the original URL captured +// for i := range cookies { +// req.AddCookie(cookies[i]) +// } + +// resp, err = c.executeGET(item, req, false) +// if err != nil && err.Error() == "URL from redirection has already been seen" { +// return nil +// } else if err != nil { +// return err +// } +// defer resp.Body.Close() + +// if extractor.IsM3U8(resp) { +// assets, err := extractor.M3U8(resp) +// if err == nil { +// assets = 
c.seencheckAssets(assets, item) +// if len(assets) != 0 { +// c.captureAssets(item, assets, cookies, headers) +// } +// } else { +// c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8") +// } +// } + +// io.Copy(io.Discard, resp.Body) + +// return nil +// } + +// func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie, headers map[string]string) { +// // TODO: implement a counter for the number of assets +// // currently being processed +// // c.Frontier.QueueCount.Incr(int64(len(assets))) +// swg := sizedwaitgroup.New(int(c.MaxConcurrentAssets)) +// excluded := false + +// for _, asset := range assets { +// // TODO: implement a counter for the number of assets +// // currently being processed +// // c.Frontier.QueueCount.Incr(-1) + +// // Just making sure we do not over archive by archiving the original URL +// if utils.URLToString(item.URL) == utils.URLToString(asset) { +// continue +// } + +// // If the URL match any excluded string, we ignore it +// for _, excludedString := range c.ExcludedStrings { +// if strings.Contains(utils.URLToString(asset), excludedString) { +// excluded = true +// break +// } +// } + +// if excluded { +// excluded = false +// continue +// } + +// swg.Add() +// c.URIsPerSecond.Incr(1) + +// go func(asset *url.URL, swg *sizedwaitgroup.SizedWaitGroup) { +// defer swg.Done() + +// // Create the asset's item +// newAsset, err := queue.NewItem(asset, item.URL, "asset", item.Hop, "", false) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{ +// "parentHop": item.Hop, +// "parentUrl": utils.URLToString(item.URL), +// "type": "asset", +// })).Error("error while creating asset item") +// return +// } + +// // Capture the asset +// err = c.captureAsset(newAsset, cookies, headers) +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{ +// "parentHop": item.Hop, +// "parentUrl": 
utils.URLToString(item.URL), +// "type": "asset", +// })).Error("error while capturing asset") +// return +// } + +// // If we made it to this point, it means that the asset have been crawled successfully, +// // then we can increment the locallyCrawled variable +// atomic.AddUint64(&item.LocallyCrawled, 1) +// }(asset, &swg) +// } + +// swg.Wait() +// } + +// func (c *Crawl) seencheckAssets(assets []*url.URL, item *queue.Item) []*url.URL { +// if c.UseSeencheck { +// if c.UseHQ { +// seencheckedURLs, err := c.HQSeencheckURLs(assets) +// // We ignore the error here because we don't want to slow down the crawl +// // if HQ is down or if the request failed. So if we get an error, we just +// // continue with the original list of assets. +// if err != nil { +// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ +// "urls": assets, +// "parentHop": item.Hop, +// "parentUrl": utils.URLToString(item.URL), +// })).Error("error while seenchecking assets via HQ") +// } else { +// assets = seencheckedURLs +// } + +// if len(assets) == 0 { +// return []*url.URL{} +// } +// } else { +// seencheckedBatch := []*url.URL{} + +// for _, URL := range assets { +// found := c.Seencheck.SeencheckURL(utils.URLToString(URL), "asset") +// if found { +// continue +// } + +// seencheckedBatch = append(seencheckedBatch, URL) +// } + +// if len(seencheckedBatch) == 0 { +// return []*url.URL{} +// } + +// assets = seencheckedBatch +// } +// } + +// return assets +// } func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) { var rawAssets []string diff --git a/internal/pkg/crawl/capture.go b/.old/internal/pkg/crawl/capture.go similarity index 95% rename from internal/pkg/crawl/capture.go rename to .old/internal/pkg/crawl/capture.go index 0e7308c8..6171cb1d 100644 --- a/internal/pkg/crawl/capture.go +++ b/.old/internal/pkg/crawl/capture.go @@ -35,24 +35,24 @@ func (c *Crawl) executeGET(item *queue.Item, req 
*http.Request, isRedirection bo URL *url.URL ) - defer func() { - if c.PrometheusMetrics != nil { - c.PrometheusMetrics.DownloadedURI.Inc() - } + // defer func() { + // if c.PrometheusMetrics != nil { + // c.PrometheusMetrics.DownloadedURI.Inc() + // } - c.URIsPerSecond.Incr(1) + // c.URIsPerSecond.Incr(1) - if item.Type == "seed" { - c.CrawledSeeds.Incr(1) - } else if item.Type == "asset" { - c.CrawledAssets.Incr(1) - } - }() + // if item.Type == "seed" { + // c.CrawledSeeds.Incr(1) + // } else if item.Type == "asset" { + // c.CrawledAssets.Incr(1) + // } + // }() - // Check if the crawl is paused - for c.Paused.Get() { - time.Sleep(time.Second) - } + // // Check if the crawl is paused + // for c.Paused.Get() { + // time.Sleep(time.Second) + // } // Retry on 429 error for retry := uint8(0); retry < c.MaxRetry; retry++ { @@ -414,26 +414,6 @@ func (c *Crawl) Capture(item *queue.Item) error { } return nil - } else if ina.IsAPIURL(req) { - rawAssets, err := ina.ExtractMedias(resp) - if err != nil { - c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract medias from INA") - } - - if len(rawAssets) != 0 { - assets = c.seencheckAssets(rawAssets, item) - - if len(assets) != 0 { - for _, asset := range rawAssets { - playerItem, err := queue.NewItem(asset, item.URL, "seed", 0, "", false) - if err != nil { - c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to create new item from asset") - } else { - c.Capture(playerItem) - } - } - } - } } // Scrape potential URLs from Link HTTP header diff --git a/internal/pkg/crawl/config.go b/.old/internal/pkg/crawl/config.go similarity index 98% rename from internal/pkg/crawl/config.go rename to .old/internal/pkg/crawl/config.go index a2bae436..06f80fc6 100644 --- a/internal/pkg/crawl/config.go +++ b/.old/internal/pkg/crawl/config.go @@ -116,9 +116,7 @@ type Crawl struct { HQStrategy string HQBatchConcurrency int HQBatchSize int - HQContinuousPull bool HQClient *gocrawlhq.Client - HQConsumerState 
string HQFinishedChannel chan *queue.Item HQProducerChannel chan *queue.Item HQChannelsWg *sync.WaitGroup @@ -321,7 +319,6 @@ func GenerateCrawlConfig(config *config.Config) (*Crawl, error) { c.HQStrategy = config.HQStrategy c.HQBatchSize = int(config.HQBatchSize) c.HQBatchConcurrency = config.HQBatchConcurrency - c.HQContinuousPull = config.HQContinuousPull c.HQRateLimitingSendBack = config.HQRateLimitSendBack // Handover mechanism diff --git a/internal/pkg/crawl/crawl.go b/.old/internal/pkg/crawl/crawl.go similarity index 98% rename from internal/pkg/crawl/crawl.go rename to .old/internal/pkg/crawl/crawl.go index 53fcff59..980e6665 100644 --- a/internal/pkg/crawl/crawl.go +++ b/.old/internal/pkg/crawl/crawl.go @@ -16,7 +16,7 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/utils" "github.com/internetarchive/gocrawlhq" "github.com/prometheus/client_golang/prometheus" - "github.com/telanflow/cookiejar" + "github.com/ssgelm/cookiejarparser" "mvdan.cc/xurls/v2" ) @@ -173,7 +173,7 @@ func (c *Crawl) Start() (err error) { // Parse input cookie file if specified if c.CookieFile != "" { - cookieJar, err := cookiejar.NewFileJar(c.CookieFile, nil) + cookieJar, err := cookiejarparser.LoadCookieJarFile("cookies.txt") if err != nil { c.Log.WithFields(c.genLogFields(err, nil, nil)).Fatal("unable to parse cookie file") } diff --git a/internal/pkg/crawl/exclusion.go b/.old/internal/pkg/crawl/exclusion.go similarity index 100% rename from internal/pkg/crawl/exclusion.go rename to .old/internal/pkg/crawl/exclusion.go diff --git a/internal/pkg/crawl/extractor/json.go b/.old/internal/pkg/crawl/extractor/json.go similarity index 100% rename from internal/pkg/crawl/extractor/json.go rename to .old/internal/pkg/crawl/extractor/json.go diff --git a/internal/pkg/crawl/extractor/json_test.go b/.old/internal/pkg/crawl/extractor/json_test.go similarity index 100% rename from internal/pkg/crawl/extractor/json_test.go rename to .old/internal/pkg/crawl/extractor/json_test.go diff 
--git a/internal/pkg/crawl/extractor/m3u8.go b/.old/internal/pkg/crawl/extractor/m3u8.go similarity index 100% rename from internal/pkg/crawl/extractor/m3u8.go rename to .old/internal/pkg/crawl/extractor/m3u8.go diff --git a/internal/pkg/crawl/extractor/s3.go b/.old/internal/pkg/crawl/extractor/s3.go similarity index 100% rename from internal/pkg/crawl/extractor/s3.go rename to .old/internal/pkg/crawl/extractor/s3.go diff --git a/internal/pkg/crawl/extractor/utils.go b/.old/internal/pkg/crawl/extractor/utils.go similarity index 100% rename from internal/pkg/crawl/extractor/utils.go rename to .old/internal/pkg/crawl/extractor/utils.go diff --git a/internal/pkg/crawl/extractor/xml.go b/.old/internal/pkg/crawl/extractor/xml.go similarity index 100% rename from internal/pkg/crawl/extractor/xml.go rename to .old/internal/pkg/crawl/extractor/xml.go diff --git a/internal/pkg/crawl/extractor/xml_test.go b/.old/internal/pkg/crawl/extractor/xml_test.go similarity index 100% rename from internal/pkg/crawl/extractor/xml_test.go rename to .old/internal/pkg/crawl/extractor/xml_test.go diff --git a/internal/pkg/crawl/extractor/xml_test_sitemap.xml b/.old/internal/pkg/crawl/extractor/xml_test_sitemap.xml similarity index 100% rename from internal/pkg/crawl/extractor/xml_test_sitemap.xml rename to .old/internal/pkg/crawl/extractor/xml_test_sitemap.xml diff --git a/internal/pkg/crawl/finish.go b/.old/internal/pkg/crawl/finish.go similarity index 100% rename from internal/pkg/crawl/finish.go rename to .old/internal/pkg/crawl/finish.go diff --git a/internal/pkg/crawl/http_utils.go b/.old/internal/pkg/crawl/http_utils.go similarity index 100% rename from internal/pkg/crawl/http_utils.go rename to .old/internal/pkg/crawl/http_utils.go diff --git a/internal/pkg/crawl/log.go b/.old/internal/pkg/crawl/log.go similarity index 100% rename from internal/pkg/crawl/log.go rename to .old/internal/pkg/crawl/log.go diff --git a/internal/pkg/crawl/outlinks.go b/.old/internal/pkg/crawl/outlinks.go 
similarity index 100% rename from internal/pkg/crawl/outlinks.go rename to .old/internal/pkg/crawl/outlinks.go diff --git a/internal/pkg/crawl/stats.go b/.old/internal/pkg/crawl/stats.go similarity index 100% rename from internal/pkg/crawl/stats.go rename to .old/internal/pkg/crawl/stats.go diff --git a/internal/pkg/crawl/utils.go b/.old/internal/pkg/crawl/utils.go similarity index 92% rename from internal/pkg/crawl/utils.go rename to .old/internal/pkg/crawl/utils.go index d59434cd..8a7d718b 100644 --- a/internal/pkg/crawl/utils.go +++ b/.old/internal/pkg/crawl/utils.go @@ -82,12 +82,3 @@ func extractLinksFromText(source string) (links []*url.URL) { // func (c *Crawl) shouldPause(host string) bool { // return c.Frontier.GetActiveHostCount(host) >= c.MaxConcurrentRequestsPerDomain // } - -func isStatusCodeRedirect(statusCode int) bool { - if statusCode == 300 || statusCode == 301 || - statusCode == 302 || statusCode == 307 || - statusCode == 308 { - return true - } - return false -} diff --git a/internal/pkg/crawl/warc.go b/.old/internal/pkg/crawl/warc.go similarity index 100% rename from internal/pkg/crawl/warc.go rename to .old/internal/pkg/crawl/warc.go diff --git a/internal/pkg/crawl/worker.go b/.old/internal/pkg/crawl/worker.go similarity index 100% rename from internal/pkg/crawl/worker.go rename to .old/internal/pkg/crawl/worker.go diff --git a/internal/pkg/crawl/worker_pool.go b/.old/internal/pkg/crawl/worker_pool.go similarity index 100% rename from internal/pkg/crawl/worker_pool.go rename to .old/internal/pkg/crawl/worker_pool.go diff --git a/internal/pkg/log/elasticsearch.go b/.old/internal/pkg/log/elasticsearch.go similarity index 100% rename from internal/pkg/log/elasticsearch.go rename to .old/internal/pkg/log/elasticsearch.go diff --git a/internal/pkg/log/file.go b/.old/internal/pkg/log/file.go similarity index 100% rename from internal/pkg/log/file.go rename to .old/internal/pkg/log/file.go diff --git a/.old/internal/pkg/log/log.go 
b/.old/internal/pkg/log/log.go new file mode 100644 index 00000000..e5b56b61 --- /dev/null +++ b/.old/internal/pkg/log/log.go @@ -0,0 +1,262 @@ +// Package log provides a custom logging solution with multi-output support +// and log rotation for file output. +// ----------------------------------------------------------------------------- +// When Logger.{Debug, Info, Warn, Error, Fatal} is called, the log message is +// passed to all underlying handlers represented by Logger.handler +// Then multiHandler.Handle is called to pass the log message to all underlying handlers. +// ----------------------------------------------------------------------------- +// The rotation mechanism works by locking the logger, checking if it's time to rotate, +// and then calling the Rotate method on all rotatable handlers. +package log + +import ( + "context" + "fmt" + "log/slog" + "os" + "path/filepath" + "sync" + "sync/atomic" + "time" + + "github.com/elastic/go-elasticsearch/v8" +) + +var ( + isLoggerInit *atomic.Bool + storedLogger *Logger + once sync.Once +) + +// Logger wraps slog.Logger to provide multi-output functionality +type Logger struct { + sync.Mutex + handler *multiHandler + slogger *slog.Logger + stopRotation chan struct{} + stopErrorLog chan struct{} + errorChan chan error +} + +// Config holds the configuration for the logger +type Config struct { + FileConfig *LogfileConfig + FileLevel slog.Level + StdoutEnabled bool + StdoutLevel slog.Level + RotateLogFile bool + ElasticsearchConfig *ElasticsearchConfig + RotateElasticSearchIndex bool + isDefault bool +} + +// New creates a new Logger instance with the given configuration. +// It sets up handlers for stdout (text format) and file output (JSON format) if specified. +// If FileOutput is empty, only stdout logging will be enabled. +// Only the first call to New will store the logger to be reused. Subsequent calls will return a new logger instance. +// Only the first call to New will rotate the logs destinations. 
+// Please refrain from calling New multiple times in the same program. +// +// Parameters: +// - cfg: Config struct containing logger configuration options +// +// Returns: +// - *Logger: A new Logger instance +// - error: An error if there was a problem creating the logger (e.g., unable to open log file) +func New(cfg Config) (*Logger, error) { + var handlers []slog.Handler + + // Create stdout handler + if cfg.StdoutEnabled { + stdoutHandler := slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{ + Level: cfg.StdoutLevel, + }) + handlers = append(handlers, stdoutHandler) + } + + // Create file handler if FileOutput is specified + if cfg.FileConfig != nil { + // Create directories if they don't exist + err := os.MkdirAll(filepath.Dir(cfg.FileConfig.Filename()), 0755) + if err != nil { + return nil, err + } + + // Open log file + file, err := os.OpenFile(cfg.FileConfig.Filename(), os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + if err != nil { + return nil, err + } + fileHandler := &fileHandler{ + Handler: slog.NewJSONHandler(file, &slog.HandlerOptions{Level: cfg.FileLevel}), + fileDescriptor: file, + rotationInterval: 6 * time.Hour, + lastRotation: time.Now(), + logfileConfig: cfg.FileConfig, + } + handlers = append(handlers, fileHandler) + } + + // Create Elasticsearch handler if ElasticsearchConfig is specified + if cfg.ElasticsearchConfig != nil { + esClient, err := elasticsearch.NewClient(elasticsearch.Config{ + Addresses: cfg.ElasticsearchConfig.Addresses, + Username: cfg.ElasticsearchConfig.Username, + Password: cfg.ElasticsearchConfig.Password, + }) + if err != nil { + return nil, fmt.Errorf("failed to create Elasticsearch client: %w", err) + } + esHandler := &ElasticsearchHandler{ + client: esClient, + index: fmt.Sprintf("%s-%s", cfg.ElasticsearchConfig.IndexPrefix, time.Now().Format("2006.01.02")), + level: cfg.ElasticsearchConfig.Level, + attrs: []slog.Attr{}, + groups: []string{}, + config: cfg.ElasticsearchConfig, + } + if err := esHandler.createIndex(); 
err != nil { + return nil, fmt.Errorf("failed to create Elasticsearch index: %w", err) + } + handlers = append(handlers, esHandler) + } + + // Create multi-handler + mh := &multiHandler{handlers: handlers} + + // Create slog.Logger + slogger := slog.New(mh) + + logger := &Logger{ + handler: mh, + slogger: slogger, + errorChan: make(chan error, 10), + stopErrorLog: make(chan struct{}), + } + + if !cfg.isDefault { + once.Do(func() { + isLoggerInit = new(atomic.Bool) + storedLogger = logger + isLoggerInit.CompareAndSwap(false, true) + + // Start rotation goroutine + logger.startRotation() + }) + } + + return logger, nil +} + +// DefaultOrStored returns the default Logger instance or if already initialized, the logger created by first call to New(). +// The default logger writes to both stdout (text format) and a file named "app.log" (JSON format). +// Both outputs are set to log messages at Info level and above. +// This function uses sync.Once to ensure that the default logger is only created once. +// +// Returns: +// - *Logger: The default Logger instance +// - bool: True if the logger was created by this function, false if the logger was already initialized +func DefaultOrStored() (*Logger, bool) { + var created = false + once.Do(func() { + isLoggerInit = new(atomic.Bool) + logger, err := New(Config{ + FileConfig: &LogfileConfig{Dir: "jobs", Prefix: "zeno"}, + FileLevel: slog.LevelInfo, + StdoutLevel: slog.LevelInfo, + isDefault: true, + }) + if err != nil { + panic(err) + } + storedLogger = logger + created = isLoggerInit.CompareAndSwap(false, true) + }) + return storedLogger, created +} + +// GetStoredLogger returns the logger created by the first call to New() or DefaultOrStored(). +// If the logger has not been initialized, it will return nil. 
+func GetStoredLogger() *Logger { + return storedLogger +} + +// Errors returns a channel that will receive logging errors +func (l *Logger) Errors() <-chan error { + return l.errorChan +} + +func (l *Logger) log(level slog.Level, msg string, args ...any) { + l.Lock() + defer l.Unlock() + + // Create a new Record with the message and args + r := slog.NewRecord(time.Now(), level, msg, 0) + r.Add(args...) + + err := l.handler.Handle(context.Background(), r) + if err != nil { + select { + case l.errorChan <- err: + default: + // If the error channel is full, log to stderr as a last resort + fmt.Fprintf(os.Stderr, "Logging error: %v\n", err) + } + } +} + +// Debug logs a message at Debug level. +// The first argument is the message to log, and subsequent arguments are key-value pairs +// that will be included in the log entry. +// +// Parameters: +// - msg: The message to log +// - args: Optional key-value pairs to include in the log entry +func (l *Logger) Debug(msg string, args ...any) { + l.log(slog.LevelDebug, msg, args...) +} + +// Info logs a message at Info level. +// The first argument is the message to log, and subsequent arguments are key-value pairs +// that will be included in the log entry. +// +// Parameters: +// - msg: The message to log +// - args: Optional key-value pairs to include in the log entry +func (l *Logger) Info(msg string, args ...any) { + l.log(slog.LevelInfo, msg, args...) +} + +// Warn logs a message at Warn level. +// The first argument is the message to log, and subsequent arguments are key-value pairs +// that will be included in the log entry. +// +// Parameters: +// - msg: The message to log +// - args: Optional key-value pairs to include in the log entry +func (l *Logger) Warn(msg string, args ...any) { + l.log(slog.LevelWarn, msg, args...) +} + +// Error logs a message at Error level. +// The first argument is the message to log, and subsequent arguments are key-value pairs +// that will be included in the log entry. 
+// +// Parameters: +// - msg: The message to log +// - args: Optional key-value pairs to include in the log entry +func (l *Logger) Error(msg string, args ...any) { + l.log(slog.LevelError, msg, args...) +} + +// Fatal logs a message at Error level and then calls os.Exit(1). +// The first argument is the message to log, and subsequent arguments are key-value pairs +// that will be included in the log entry. +// +// Parameters: +// - msg: The message to log +// - args: Optional key-value pairs to include in the log entry +func (l *Logger) Fatal(msg string, args ...any) { + l.log(slog.LevelError, msg, args...) + os.Exit(1) +} diff --git a/internal/pkg/log/misc.go b/.old/internal/pkg/log/misc.go similarity index 100% rename from internal/pkg/log/misc.go rename to .old/internal/pkg/log/misc.go diff --git a/internal/pkg/log/multi_handler.go b/.old/internal/pkg/log/multi_handler.go similarity index 100% rename from internal/pkg/log/multi_handler.go rename to .old/internal/pkg/log/multi_handler.go diff --git a/internal/pkg/log/rotate.go b/.old/internal/pkg/log/rotate.go similarity index 100% rename from internal/pkg/log/rotate.go rename to .old/internal/pkg/log/rotate.go diff --git a/internal/pkg/log/withfields.go b/.old/internal/pkg/log/withfields.go similarity index 100% rename from internal/pkg/log/withfields.go rename to .old/internal/pkg/log/withfields.go diff --git a/internal/pkg/log/writer.go b/.old/internal/pkg/log/writer.go similarity index 100% rename from internal/pkg/log/writer.go rename to .old/internal/pkg/log/writer.go diff --git a/internal/pkg/queue/access.go b/.old/internal/pkg/queue/access.go similarity index 100% rename from internal/pkg/queue/access.go rename to .old/internal/pkg/queue/access.go diff --git a/internal/pkg/queue/access_test.go b/.old/internal/pkg/queue/access_test.go similarity index 100% rename from internal/pkg/queue/access_test.go rename to .old/internal/pkg/queue/access_test.go diff --git a/internal/pkg/queue/dequeue.go 
b/.old/internal/pkg/queue/dequeue.go similarity index 100% rename from internal/pkg/queue/dequeue.go rename to .old/internal/pkg/queue/dequeue.go diff --git a/internal/pkg/queue/dequeue_test.go b/.old/internal/pkg/queue/dequeue_test.go similarity index 100% rename from internal/pkg/queue/dequeue_test.go rename to .old/internal/pkg/queue/dequeue_test.go diff --git a/internal/pkg/queue/encoding.go b/.old/internal/pkg/queue/encoding.go similarity index 100% rename from internal/pkg/queue/encoding.go rename to .old/internal/pkg/queue/encoding.go diff --git a/internal/pkg/queue/encoding_test.go b/.old/internal/pkg/queue/encoding_test.go similarity index 100% rename from internal/pkg/queue/encoding_test.go rename to .old/internal/pkg/queue/encoding_test.go diff --git a/internal/pkg/queue/enqueue.go b/.old/internal/pkg/queue/enqueue.go similarity index 100% rename from internal/pkg/queue/enqueue.go rename to .old/internal/pkg/queue/enqueue.go diff --git a/internal/pkg/queue/enqueue_test.go b/.old/internal/pkg/queue/enqueue_test.go similarity index 100% rename from internal/pkg/queue/enqueue_test.go rename to .old/internal/pkg/queue/enqueue_test.go diff --git a/internal/pkg/queue/error.go b/.old/internal/pkg/queue/error.go similarity index 100% rename from internal/pkg/queue/error.go rename to .old/internal/pkg/queue/error.go diff --git a/internal/pkg/queue/handover.go b/.old/internal/pkg/queue/handover.go similarity index 100% rename from internal/pkg/queue/handover.go rename to .old/internal/pkg/queue/handover.go diff --git a/internal/pkg/queue/handover_test.go b/.old/internal/pkg/queue/handover_test.go similarity index 100% rename from internal/pkg/queue/handover_test.go rename to .old/internal/pkg/queue/handover_test.go diff --git a/internal/pkg/queue/index/encoding.go b/.old/internal/pkg/queue/index/encoding.go similarity index 100% rename from internal/pkg/queue/index/encoding.go rename to .old/internal/pkg/queue/index/encoding.go diff --git 
a/internal/pkg/queue/index/error.go b/.old/internal/pkg/queue/index/error.go similarity index 100% rename from internal/pkg/queue/index/error.go rename to .old/internal/pkg/queue/index/error.go diff --git a/internal/pkg/queue/index/file_io.go b/.old/internal/pkg/queue/index/file_io.go similarity index 100% rename from internal/pkg/queue/index/file_io.go rename to .old/internal/pkg/queue/index/file_io.go diff --git a/internal/pkg/queue/index/index.go b/.old/internal/pkg/queue/index/index.go similarity index 100% rename from internal/pkg/queue/index/index.go rename to .old/internal/pkg/queue/index/index.go diff --git a/internal/pkg/queue/index/manager.go b/.old/internal/pkg/queue/index/manager.go similarity index 100% rename from internal/pkg/queue/index/manager.go rename to .old/internal/pkg/queue/index/manager.go diff --git a/internal/pkg/queue/index/manager_test.go b/.old/internal/pkg/queue/index/manager_test.go similarity index 100% rename from internal/pkg/queue/index/manager_test.go rename to .old/internal/pkg/queue/index/manager_test.go diff --git a/internal/pkg/queue/index/recovery.go b/.old/internal/pkg/queue/index/recovery.go similarity index 100% rename from internal/pkg/queue/index/recovery.go rename to .old/internal/pkg/queue/index/recovery.go diff --git a/internal/pkg/queue/index/recovery_test.go b/.old/internal/pkg/queue/index/recovery_test.go similarity index 100% rename from internal/pkg/queue/index/recovery_test.go rename to .old/internal/pkg/queue/index/recovery_test.go diff --git a/internal/pkg/queue/index/wal.go b/.old/internal/pkg/queue/index/wal.go similarity index 100% rename from internal/pkg/queue/index/wal.go rename to .old/internal/pkg/queue/index/wal.go diff --git a/internal/pkg/queue/index/wal_test.go b/.old/internal/pkg/queue/index/wal_test.go similarity index 100% rename from internal/pkg/queue/index/wal_test.go rename to .old/internal/pkg/queue/index/wal_test.go diff --git a/internal/pkg/queue/item.go b/.old/internal/pkg/queue/item.go 
similarity index 100% rename from internal/pkg/queue/item.go rename to .old/internal/pkg/queue/item.go diff --git a/internal/pkg/queue/item_test.go b/.old/internal/pkg/queue/item_test.go similarity index 100% rename from internal/pkg/queue/item_test.go rename to .old/internal/pkg/queue/item_test.go diff --git a/internal/pkg/queue/metadata.go b/.old/internal/pkg/queue/metadata.go similarity index 100% rename from internal/pkg/queue/metadata.go rename to .old/internal/pkg/queue/metadata.go diff --git a/internal/pkg/queue/protobuf/v1/item.pb.go b/.old/internal/pkg/queue/protobuf/v1/item.pb.go similarity index 100% rename from internal/pkg/queue/protobuf/v1/item.pb.go rename to .old/internal/pkg/queue/protobuf/v1/item.pb.go diff --git a/internal/pkg/queue/protobuf/v1/item.proto b/.old/internal/pkg/queue/protobuf/v1/item.proto similarity index 100% rename from internal/pkg/queue/protobuf/v1/item.proto rename to .old/internal/pkg/queue/protobuf/v1/item.proto diff --git a/internal/pkg/queue/queue.go b/.old/internal/pkg/queue/queue.go similarity index 100% rename from internal/pkg/queue/queue.go rename to .old/internal/pkg/queue/queue.go diff --git a/internal/pkg/queue/queue_test.go b/.old/internal/pkg/queue/queue_test.go similarity index 100% rename from internal/pkg/queue/queue_test.go rename to .old/internal/pkg/queue/queue_test.go diff --git a/internal/pkg/queue/stats.go b/.old/internal/pkg/queue/stats.go similarity index 100% rename from internal/pkg/queue/stats.go rename to .old/internal/pkg/queue/stats.go diff --git a/cmd/cmd.go b/cmd/cmd.go index 62cd5526..2d606cf4 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -3,7 +3,7 @@ package cmd import ( "fmt" - "github.com/internetarchive/Zeno/config" + "github.com/internetarchive/Zeno/internal/pkg/config" "github.com/spf13/cobra" ) @@ -27,7 +27,7 @@ Authors: return fmt.Errorf("error initializing config: %s", err) } - cfg = config.GetConfig() + cfg = config.Get() return nil }, Run: func(cmd *cobra.Command, args []string) { @@ 
-43,6 +43,7 @@ func Run() error { rootCmd.PersistentFlags().String("log-level", "info", "stdout log level (debug, info, warn, error)") rootCmd.PersistentFlags().String("config-file", "", "config file (default is $HOME/zeno-config.yaml)") rootCmd.PersistentFlags().Bool("no-stdout-log", false, "disable stdout logging.") + rootCmd.PersistentFlags().Bool("no-stderr-log", false, "disable stderr logging.") rootCmd.PersistentFlags().Bool("consul-config", false, "Use this flag to enable consul config support") rootCmd.PersistentFlags().String("consul-address", "", "The consul address used to retreive config") rootCmd.PersistentFlags().String("consul-path", "", "The full Consul K/V path where the config is stored") diff --git a/cmd/get.go b/cmd/get.go index fc273e2c..33893484 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -20,41 +20,39 @@ func getCMDs() *cobra.Command { getCmd.AddCommand(getURLCmd) getCmd.AddCommand(getHQCmd) - getCmd.AddCommand(getListCmd) return getCmd } func getCMDsFlags(getCmd *cobra.Command) { - getCmd.PersistentFlags().String("user-agent", "Zeno", "User agent to use when requesting URLs.") + getCmd.PersistentFlags().String("user-agent", "", "User agent to use when requesting URLs.") getCmd.PersistentFlags().String("job", "", "Job name to use, will determine the path for the persistent queue, seencheck database, and WARC files.") getCmd.PersistentFlags().IntP("workers", "w", 1, "Number of concurrent workers to run.") - getCmd.PersistentFlags().Int("max-concurrent-assets", 8, "Max number of concurrent assets to fetch PER worker. E.g. if you have 100 workers and this setting at 8, Zeno could do up to 800 concurrent requests at any time.") - getCmd.PersistentFlags().Uint8("max-hops", 0, "Maximum number of hops to execute.") + getCmd.PersistentFlags().Int("max-concurrent-assets", 1, "Max number of concurrent assets to fetch PER worker. E.g. 
if you have 100 workers and this setting at 8, Zeno could do up to 800 concurrent requests at any time.") + getCmd.PersistentFlags().Int("max-hops", 0, "Maximum number of hops to execute.") getCmd.PersistentFlags().String("cookies", "", "File containing cookies that will be used for requests.") getCmd.PersistentFlags().Bool("keep-cookies", false, "Keep a global cookie jar") getCmd.PersistentFlags().Bool("headless", false, "Use headless browsers instead of standard GET requests.") getCmd.PersistentFlags().Bool("disable-seencheck", false, "Disable the (remote or local) seencheck that avoid re-crawling of URIs.") getCmd.PersistentFlags().Bool("json", false, "Output logs in JSON") - getCmd.PersistentFlags().Bool("debug", false, "") getCmd.PersistentFlags().Bool("api", false, "Enable API") getCmd.PersistentFlags().String("api-port", "9443", "Port to listen on for the API.") getCmd.PersistentFlags().Bool("prometheus", false, "Export metrics in Prometheus format. (implies --api)") getCmd.PersistentFlags().String("prometheus-prefix", "zeno:", "String used as a prefix for the exported Prometheus metrics.") getCmd.PersistentFlags().Int("max-redirect", 20, "Specifies the maximum number of redirections to follow for a resource.") getCmd.PersistentFlags().Int("max-retry", 5, "Number of retry if error happen when executing HTTP request.") - getCmd.PersistentFlags().Int("http-timeout", -1, "Number of seconds to wait before timing out a request.") - getCmd.PersistentFlags().Bool("domains-crawl", false, "If this is turned on, seeds will be treated as domains to crawl, therefore same-domain outlinks will be added to the queue as hop=0.") + getCmd.PersistentFlags().Int("http-timeout", -1, "Number of seconds to wait before timing out a request. 
Note: this will CANCEL large files download.") + getCmd.PersistentFlags().Int("http-read-deadline", 60, "Number of seconds to wait before timing out a (blocking) read.") + getCmd.PersistentFlags().StringSlice("domains-crawl", []string{}, "Naive domains, full URLs or regexp to match against any URL to determine hop behaviour for outlinks. If an outlink URL is matched it will be queued to crawl with a hop of 0. This flag helps crawling entire domains while doing non-focused crawls.") getCmd.PersistentFlags().StringSlice("disable-html-tag", []string{}, "Specify HTML tag to not extract assets from") getCmd.PersistentFlags().Bool("capture-alternate-pages", false, "If turned on, HTML tags with \"alternate\" values for their \"rel\" attribute will be archived.") getCmd.PersistentFlags().StringSlice("exclude-host", []string{}, "Exclude a specific host from the crawl, note that it will not exclude the domain if it is encountered as an asset for another web page.") getCmd.PersistentFlags().StringSlice("include-host", []string{}, "Only crawl specific hosts, note that it will not include the domain if it is encountered as an asset for another web page.") getCmd.PersistentFlags().StringSlice("include-string", []string{}, "Only crawl URLs containing this string.") - getCmd.PersistentFlags().Int("max-concurrent-per-domain", 16, "Maximum number of concurrent requests per domain.") - getCmd.PersistentFlags().Int("concurrent-sleep-length", 500, "Number of milliseconds to sleep when max concurrency per domain is reached.") getCmd.PersistentFlags().Int("crawl-time-limit", 0, "Number of seconds until the crawl will automatically set itself into the finished state.") getCmd.PersistentFlags().Int("crawl-max-time-limit", 0, "Number of seconds until the crawl will automatically panic itself. 
Default to crawl-time-limit + (crawl-time-limit / 10)") getCmd.PersistentFlags().StringSlice("exclude-string", []string{}, "Discard any (discovered) URLs containing this string.") + getCmd.PersistentFlags().StringSlice("exclusion-file", []string{}, "File containing regex to apply on URLs for exclusion. If the path start with http or https, it will be treated as a URL of a file to download.") getCmd.PersistentFlags().Int("min-space-required", 20, "Minimum space required in GB to continue the crawl.") getCmd.PersistentFlags().Bool("handover", false, "Use the handover mechanism that dispatch URLs via a buffer before enqueuing on disk. (UNSTABLE)") getCmd.PersistentFlags().Bool("ultrasafe-queue", false, "Don't use committed batch writes to the WAL and instead fsync() after each write.") @@ -74,32 +72,43 @@ func getCMDsFlags(getCmd *cobra.Command) { getCmd.PersistentFlags().Bool("warc-on-disk", false, "Do not use RAM to store payloads when recording traffic to WARCs, everything will happen on disk (usually used to reduce memory usage).") getCmd.PersistentFlags().Int("warc-pool-size", 1, "Number of concurrent WARC files to write.") getCmd.PersistentFlags().String("warc-temp-dir", "", "Custom directory to use for WARC temporary files.") - getCmd.PersistentFlags().Bool("disable-local-dedupe", false, "Disable local URL agonistic deduplication.") + getCmd.PersistentFlags().Bool("disable-local-dedupe", false, "Disable local URL agnostic deduplication.") getCmd.PersistentFlags().Bool("cert-validation", false, "Enables certificate validation on HTTPS requests.") getCmd.PersistentFlags().Bool("disable-assets-capture", false, "Disable assets capture.") getCmd.PersistentFlags().Int("warc-dedupe-size", 1024, "Minimum size to deduplicate WARC records with revisit records.") - getCmd.PersistentFlags().String("cdx-cookie", "", "Pass custom cookie during CDX requests. 
Example: 'cdx_auth_token=test_value'") + getCmd.PersistentFlags().String("warc-cdx-cookie", "", "Pass custom cookie during CDX requests. Example: 'cdx_auth_token=test_value'") getCmd.PersistentFlags().Int("warc-size", 1024, "Size of the WARC files in MB.") // Logging flags - getCmd.PersistentFlags().Bool("live-stats", false, "Enable live stats but disable logging. (implies --no-stdout-log)") + getCmd.PersistentFlags().Bool("tui", false, "Display a terminal user interface.") + getCmd.PersistentFlags().String("tui-log-level", "info", "Log level for the TUI.") + getCmd.PersistentFlags().Bool("no-log-file", false, "Disable log file output.") getCmd.PersistentFlags().String("log-file-output-dir", "", "Directory to write log files to.") - getCmd.PersistentFlags().String("es-url", "", "comma-separated ElasticSearch URL to use for indexing crawl logs.") - getCmd.PersistentFlags().String("es-user", "", "ElasticSearch username to use for indexing crawl logs.") - getCmd.PersistentFlags().String("es-password", "", "ElasticSearch password to use for indexing crawl logs.") - getCmd.PersistentFlags().String("es-index-prefix", "zeno", "ElasticSearch index prefix to use for indexing crawl logs. Default is : `zeno`, without `-`") + getCmd.PersistentFlags().String("log-file-prefix", "ZENO", "Prefix to use when naming the log files. Default is : `ZENO`, without '-'") + getCmd.PersistentFlags().String("log-file-level", "info", "Log level for the log file.") + getCmd.PersistentFlags().String("log-file-rotation", "1h", "Log file rotation period. Default is : `1h`. 
Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'.") + getCmd.PersistentFlags().String("log-es-urls", "", "(Not implemented) Comma-separated ElasticSearch URL to use for indexing crawl logs.") + getCmd.PersistentFlags().String("log-es-user", "", "(Not implemented) ElasticSearch username to use for indexing crawl logs.") + getCmd.PersistentFlags().String("log-es-password", "", "(Not implemented) ElasticSearch password to use for indexing crawl logs.") + getCmd.PersistentFlags().String("log-es-index-prefix", "zeno", "(Not implemented) ElasticSearch index prefix to use for indexing crawl logs. Default is : `zeno`, without `-`") + getCmd.PersistentFlags().String("log-es-rotation", "1d", "(Not implemented) ElasticSearch index rotation period. Default is : `1d`. Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'.") + // log-level is defined in the root command + // no-stdout-log is defined in the root command // Dependencies flags getCmd.PersistentFlags().Bool("no-ytdlp", false, "Disable youtube-dlp usage for video extraction.") getCmd.PersistentFlags().String("ytdlp-path", "", "Path to youtube-dlp binary.") + // Profiling flags + getCmd.PersistentFlags().String("pyroscope-address", "", "Pyroscope server address. 
Setting this flag will enable profiling.") + // Alias support // As cobra doesn't support aliases natively (couldn't find a way to do it), we have to do it manually // This is a workaround to allow users to use `--hops` instead of `--max-hops` for example // Aliases shouldn't be used as proper flags nor declared in the config struct // Aliases should be marked as deprecated to inform the user base // Aliases values should be copied to the proper flag in the config/config.go:handleFlagsAliases() function - getCmd.PersistentFlags().Uint8("hops", 0, "Maximum number of hops to execute.") + getCmd.PersistentFlags().Int("hops", 0, "Maximum number of hops to execute.") getCmd.PersistentFlags().MarkDeprecated("hops", "use --max-hops instead") getCmd.PersistentFlags().MarkHidden("hops") diff --git a/cmd/get_hq.go b/cmd/get_hq.go index 3b3ad02d..2c6fe45f 100644 --- a/cmd/get_hq.go +++ b/cmd/get_hq.go @@ -2,46 +2,86 @@ package cmd import ( "fmt" + "os" + "runtime" + "time" - "github.com/internetarchive/Zeno/internal/pkg/crawl" + "github.com/google/uuid" + "github.com/grafana/pyroscope-go" + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/controler" + "github.com/internetarchive/Zeno/internal/pkg/ui" + "github.com/internetarchive/Zeno/internal/pkg/utils" "github.com/spf13/cobra" ) var getHQCmd = &cobra.Command{ Use: "hq", Short: "Start crawling with the crawl HQ connector.", - PreRunE: func(cmd *cobra.Command, args []string) error { + PreRunE: func(_ *cobra.Command, _ []string) error { if cfg == nil { return fmt.Errorf("viper config is nil") } - cfg.HQ = true - return nil - }, - RunE: func(cmd *cobra.Command, args []string) error { - // Init crawl using the flags provided - crawl, err := crawl.GenerateCrawlConfig(cfg) + + err := config.GenerateCrawlConfig() if err != nil { - if crawl != nil && crawl.Log != nil { - crawl.Log.WithFields(map[string]interface{}{ - "crawl": crawl, - "err": err.Error(), - }).Error("'get hq' 
exited due to error") - } return err } - // start crawl - err = crawl.Start() - if err != nil { - crawl.Log.WithFields(map[string]interface{}{ - "crawl": crawl, - "err": err.Error(), - }).Error("'get hq' Crawl() exited due to error") - return err + cfg.UseHQ = true + + if cfg.PyroscopeAddress != "" { + runtime.SetMutexProfileFraction(5) + runtime.SetBlockProfileRate(5) + + // Get the hostname via env or via command + hostname, err := os.Hostname() + if err != nil { + return fmt.Errorf("error getting hostname for Pyroscope: %w", err) + } + + Version := utils.GetVersion() + + _, err = pyroscope.Start(pyroscope.Config{ + ApplicationName: fmt.Sprintf("zeno"), + ServerAddress: cfg.PyroscopeAddress, + Logger: nil, + Tags: map[string]string{"hostname": hostname, "job": cfg.Job, "version": Version.Version, "goVersion": Version.GoVersion, "uuid": uuid.New().String()[:5]}, + UploadRate: 15 * time.Second, + ProfileTypes: []pyroscope.ProfileType{ + pyroscope.ProfileCPU, + pyroscope.ProfileAllocObjects, + pyroscope.ProfileAllocSpace, + pyroscope.ProfileInuseObjects, + pyroscope.ProfileInuseSpace, + pyroscope.ProfileGoroutines, + pyroscope.ProfileMutexCount, + pyroscope.ProfileMutexDuration, + pyroscope.ProfileBlockCount, + pyroscope.ProfileBlockDuration, + }, + }) + + if err != nil { + panic(fmt.Errorf("error starting pyroscope: %w", err)) + } } return nil }, + RunE: func(_ *cobra.Command, _ []string) error { + controler.Start() + if config.Get().TUI { + tui := ui.New() + err := tui.Start() + if err != nil { + return fmt.Errorf("error starting TUI: %w", err) + } + } else { + controler.WatchSignals() + } + return nil + }, } func getHQCmdFlags(getHQCmd *cobra.Command) { @@ -52,8 +92,8 @@ func getHQCmdFlags(getHQCmd *cobra.Command) { getHQCmd.PersistentFlags().String("hq-project", "", "Crawl HQ project.") getHQCmd.PersistentFlags().Bool("hq-continuous-pull", false, "If turned on, the crawler will pull URLs from Crawl HQ continuously.") 
getHQCmd.PersistentFlags().String("hq-strategy", "lifo", "Crawl HQ feeding strategy.") - getHQCmd.PersistentFlags().Int64("hq-batch-size", 0, "Crawl HQ feeding batch size.") - getHQCmd.PersistentFlags().Int64("hq-batch-concurrency", 1, "Number of concurrent requests to do to get the --hq-batch-size, if batch size is 300 and batch-concurrency is 10, 30 requests will be done concurrently.") + getHQCmd.PersistentFlags().Int("hq-batch-size", 500, "Crawl HQ feeding batch size.") + getHQCmd.PersistentFlags().Int("hq-batch-concurrency", 1, "Number of concurrent requests to do to get the --hq-batch-size, if batch size is 300 and batch-concurrency is 10, 30 requests will be done concurrently.") getHQCmd.PersistentFlags().Bool("hq-rate-limiting-send-back", false, "If turned on, the crawler will send back URLs that hit a rate limit to crawl HQ.") getHQCmd.MarkPersistentFlagRequired("hq-address") diff --git a/cmd/get_list.go b/cmd/get_list.go deleted file mode 100644 index 1f935b7e..00000000 --- a/cmd/get_list.go +++ /dev/null @@ -1,61 +0,0 @@ -package cmd - -import ( - "fmt" - - "github.com/internetarchive/Zeno/internal/pkg/crawl" - "github.com/internetarchive/Zeno/internal/pkg/queue" - "github.com/spf13/cobra" -) - -var getListCmd = &cobra.Command{ - Use: "list [FILE]", - Short: "Start crawling with a seed list", - Args: cobra.ExactArgs(1), - PreRunE: func(cmd *cobra.Command, args []string) error { - if cfg == nil { - return fmt.Errorf("viper config is nil") - } - return nil - }, - RunE: func(cmd *cobra.Command, args []string) error { - // Init crawl using the flags provided - crawl, err := crawl.GenerateCrawlConfig(cfg) - if err != nil { - if crawl != nil && crawl.Log != nil { - crawl.Log.WithFields(map[string]interface{}{ - "crawl": crawl, - "err": err.Error(), - }).Error("'get hq' exited due to error") - } - return err - } - - // Initialize initial seed list - crawl.SeedList, err = queue.FileToItems(args[0]) - if err != nil || len(crawl.SeedList) <= 0 { - 
crawl.Log.WithFields(map[string]interface{}{ - "input": args[0], - "err": err.Error(), - }).Error("This is not a valid input") - return err - } - - crawl.Log.WithFields(map[string]interface{}{ - "input": args[0], - "seedsCount": len(crawl.SeedList), - }).Info("Seed list loaded") - - // Start crawl - err = crawl.Start() - if err != nil { - crawl.Log.WithFields(map[string]interface{}{ - "crawl": crawl, - "err": err.Error(), - }).Error("Crawl exited due to error") - return err - } - - return nil - }, -} diff --git a/cmd/get_url.go b/cmd/get_url.go index 7212cd68..1ed3202f 100644 --- a/cmd/get_url.go +++ b/cmd/get_url.go @@ -2,10 +2,9 @@ package cmd import ( "fmt" - "net/url" - "github.com/internetarchive/Zeno/internal/pkg/crawl" - "github.com/internetarchive/Zeno/internal/pkg/queue" + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/controler" "github.com/spf13/cobra" ) @@ -13,58 +12,29 @@ var getURLCmd = &cobra.Command{ Use: "url [URL...]", Short: "Archive given URLs", Args: cobra.MinimumNArgs(1), - PreRunE: func(cmd *cobra.Command, args []string) error { + PreRunE: func(_ *cobra.Command, args []string) error { if cfg == nil { return fmt.Errorf("viper config is nil") } - return nil - }, - RunE: func(cmd *cobra.Command, args []string) error { - // Init crawl using the flags provided - crawl, err := crawl.GenerateCrawlConfig(cfg) - if err != nil { - if crawl != nil && crawl.Log != nil { - crawl.Log.WithFields(map[string]interface{}{ - "crawl": crawl, - "err": err.Error(), - }).Error("'get url' exited due to error") - } - return err - } - // Initialize initial seed list - for _, arg := range args { - input, err := url.Parse(arg) - if err != nil { - crawl.Log.WithFields(map[string]interface{}{ - "input_url": arg, - "err": err.Error(), - }).Error("given URL is not a valid input") - return err - } + if len(args) == 0 { + return fmt.Errorf("no URLs provided") + } - item, err := queue.NewItem(input, nil, "seed", 0, "", 
false) - if err != nil { - crawl.Log.WithFields(map[string]interface{}{ - "input_url": arg, - "err": err.Error(), - }).Error("Failed to create new item") - return err - } - crawl.SeedList = append(crawl.SeedList, *item) + return nil + }, + RunE: func(_ *cobra.Command, args []string) error { + for _, URL := range args { + config.Get().InputSeeds = append(config.Get().InputSeeds, URL) } - // Start crawl - err = crawl.Start() + err := config.GenerateCrawlConfig() if err != nil { - crawl.Log.WithFields(map[string]interface{}{ - "crawl": crawl, - "err": err.Error(), - }).Error("'get url' Crawl() exited due to error") return err } - crawl.Log.Info("Crawl finished") - return err + controler.Start() + controler.WatchSignals() + return nil }, } diff --git a/config/config.go b/config/config.go deleted file mode 100644 index 916d8fc2..00000000 --- a/config/config.go +++ /dev/null @@ -1,204 +0,0 @@ -package config - -import ( - "fmt" - "net/url" - "os" - "path/filepath" - "strings" - "sync" - - "github.com/spf13/pflag" - "github.com/spf13/viper" -) - -// Config holds all configuration for our program -type Config struct { - LogLevel string `mapstructure:"log-level"` - UserAgent string `mapstructure:"user-agent"` - Job string `mapstructure:"job"` - Cookies string `mapstructure:"cookies"` - APIPort string `mapstructure:"api-port"` - PrometheusPrefix string `mapstructure:"prometheus-prefix"` - WARCPrefix string `mapstructure:"warc-prefix"` - WARCOperator string `mapstructure:"warc-operator"` - CDXDedupeServer string `mapstructure:"warc-cdx-dedupe-server"` - WARCTempDir string `mapstructure:"warc-temp-dir"` - WARCSize int `mapstructure:"warc-size"` - CDXCookie string `mapstructure:"cdx-cookie"` - HQAddress string `mapstructure:"hq-address"` - HQKey string `mapstructure:"hq-key"` - HQSecret string `mapstructure:"hq-secret"` - HQProject string `mapstructure:"hq-project"` - HQStrategy string `mapstructure:"hq-strategy"` - HQBatchSize int64 `mapstructure:"hq-batch-size"` - 
HQBatchConcurrency int `mapstructure:"hq-batch-concurrency"` - LogFileOutputDir string `mapstructure:"log-file-output-dir"` - ElasticSearchUsername string `mapstructure:"es-user"` - ElasticSearchPassword string `mapstructure:"es-password"` - ElasticSearchIndexPrefix string `mapstructure:"es-index-prefix"` - DisableHTMLTag []string `mapstructure:"disable-html-tag"` - ExcludeHosts []string `mapstructure:"exclude-host"` - IncludeHosts []string `mapstructure:"include-host"` - IncludeString []string `mapstructure:"include-string"` - ExcludeString []string `mapstructure:"exclude-string"` - ElasticSearchURLs []string `mapstructure:"es-url"` - WorkersCount int `mapstructure:"workers"` - MaxConcurrentAssets int `mapstructure:"max-concurrent-assets"` - MaxHops uint8 `mapstructure:"max-hops"` - MaxRedirect uint8 `mapstructure:"max-redirect"` - MaxRetry uint8 `mapstructure:"max-retry"` - HTTPTimeout int `mapstructure:"http-timeout"` - MaxConcurrentRequestsPerDomain int `mapstructure:"max-concurrent-per-domain"` - ConcurrentSleepLength int `mapstructure:"concurrent-sleep-length"` - CrawlTimeLimit int `mapstructure:"crawl-time-limit"` - CrawlMaxTimeLimit int `mapstructure:"crawl-max-time-limit"` - MinSpaceRequired int `mapstructure:"min-space-required"` - WARCPoolSize int `mapstructure:"warc-pool-size"` - WARCDedupeSize int `mapstructure:"warc-dedupe-size"` - KeepCookies bool `mapstructure:"keep-cookies"` - Headless bool `mapstructure:"headless"` - DisableSeencheck bool `mapstructure:"disable-seencheck"` - JSON bool `mapstructure:"json"` - Debug bool `mapstructure:"debug"` - LiveStats bool `mapstructure:"live-stats"` - API bool `mapstructure:"api"` - Prometheus bool `mapstructure:"prometheus"` - DomainsCrawl bool `mapstructure:"domains-crawl"` - CaptureAlternatePages bool `mapstructure:"capture-alternate-pages"` - WARCOnDisk bool `mapstructure:"warc-on-disk"` - DisableLocalDedupe bool `mapstructure:"disable-local-dedupe"` - CertValidation bool `mapstructure:"cert-validation"` - 
DisableAssetsCapture bool `mapstructure:"disable-assets-capture"` - HQ bool // Special field to check if HQ is enabled depending on the command called - HQContinuousPull bool `mapstructure:"hq-continuous-pull"` - HQRateLimitSendBack bool `mapstructure:"hq-rate-limiting-send-back"` - NoStdoutLogging bool `mapstructure:"no-stdout-log"` - NoBatchWriteWAL bool `mapstructure:"ultrasafe-queue"` - Handover bool `mapstructure:"handover"` - - // Network - Proxy string `mapstructure:"proxy"` - DomainsBypassProxy []string `mapstructure:"bypass-proxy"` - RandomLocalIP bool `mapstructure:"random-local-ip"` - DisableIPv4 bool `mapstructure:"disable-ipv4"` - DisableIPv6 bool `mapstructure:"disable-ipv6"` - IPv6AnyIP bool `mapstructure:"ipv6-anyip"` - - // Dependencies - NoYTDLP bool `mapstructure:"no-ytdlp"` - YTDLPPath string `mapstructure:"ytdlp-path"` -} - -var ( - config *Config - once sync.Once -) - -// InitConfig initializes the configuration -// Flags -> Env -> Config file -> Consul config -// Latest has precedence over the rest -func InitConfig() error { - var err error - once.Do(func() { - config = &Config{} - - // Check if a config file is provided via flag - if configFile := viper.GetString("config-file"); configFile != "" { - viper.SetConfigFile(configFile) - } else { - home, err := os.UserHomeDir() - if err != nil { - fmt.Println(err) - os.Exit(1) - } - - viper.AddConfigPath(home) - viper.SetConfigType("yaml") - viper.SetConfigName("zeno-config") - } - - viper.SetEnvPrefix("ZENO") - replacer := strings.NewReplacer("-", "_", ".", "_") - viper.SetEnvKeyReplacer(replacer) - viper.AutomaticEnv() - - if err = viper.ReadInConfig(); err == nil { - fmt.Println("Using config file:", viper.ConfigFileUsed()) - } - - if viper.GetBool("consul-config") && viper.GetString("consul-address") != "" { - var consulAddress *url.URL - consulAddress, err = url.Parse(viper.GetString("consul-address")) - if err != nil { - return - } - - consulPath, consulFile := 
filepath.Split(viper.GetString("consul-path")) - viper.AddRemoteProvider("consul", consulAddress.String(), consulPath) - viper.SetConfigType(filepath.Ext(consulFile)) - viper.SetConfigName(strings.TrimSuffix(consulFile, filepath.Ext(consulFile))) - - if err = viper.ReadInConfig(); err == nil { - fmt.Println("Using config file:", viper.ConfigFileUsed()) - } - } - - // This function is used to bring logic to the flags when needed (e.g. live-stats) - handleFlagsEdgeCases() - - // This function is used to handle flags aliases (e.g. hops -> max-hops) - handleFlagsAliases() - - // Unmarshal the config into the Config struct - err = viper.Unmarshal(config) - }) - return err -} - -// BindFlags binds the flags to the viper configuration -// This is needed because viper doesn't support same flag name accross multiple commands -// Details here: https://github.com/spf13/viper/issues/375#issuecomment-794668149 -func BindFlags(flagSet *pflag.FlagSet) { - flagSet.VisitAll(func(flag *pflag.Flag) { - viper.BindPFlag(flag.Name, flag) - }) -} - -// GetConfig returns the config struct -func GetConfig() *Config { - cfg := config - if cfg == nil { - panic("Config not initialized. 
Call InitConfig() before accessing the config.") - } - return cfg -} - -func handleFlagsEdgeCases() { - if viper.GetBool("live-stats") { - // If live-stats is true, set no-stdout-log to true - viper.Set("no-stdout-log", true) - } - - if viper.GetBool("prometheus") { - // If prometheus is true, set no-stdout-log to true - viper.Set("api", true) - } -} - -func handleFlagsAliases() { - // For each flag we want to alias, we check if the original flag is at default and if the alias is not - // If so, we set the original flag to the value of the alias - - if viper.GetUint("hops") != 0 && viper.GetUint("max-hops") == 0 { - viper.Set("max-hops", viper.GetUint("hops")) - } - - if viper.GetInt("ca") != 8 && viper.GetInt("max-concurrent-assets") == 8 { - viper.Set("max-concurrent-assets", viper.GetInt("ca")) - } - - if viper.GetInt("msr") != 20 && viper.GetInt("min-space-required") == 20 { - viper.Set("min-space-required", viper.GetInt("msr")) - } -} diff --git a/go.mod b/go.mod index 1200c6fa..8830ce84 100644 --- a/go.mod +++ b/go.mod @@ -1,95 +1,80 @@ module github.com/internetarchive/Zeno -go 1.22.4 +go 1.24 + +toolchain go1.24.0 require ( - github.com/CorentinB/warc v0.8.53 - github.com/PuerkitoBio/goquery v1.9.3 - github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 - github.com/clbanning/mxj/v2 v2.7.0 + github.com/CorentinB/warc v0.8.71 + github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3 + github.com/PuerkitoBio/goquery v1.10.1 + github.com/ada-url/goada v0.0.0-20250104020233-00cbf4dc9da1 + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc github.com/dustin/go-humanize v1.0.1 - github.com/elastic/go-elasticsearch/v8 v8.15.0 + github.com/gabriel-vasile/mimetype v1.4.8 + github.com/gdamore/tcell/v2 v2.8.1 github.com/google/uuid v1.6.0 - github.com/gosuri/uilive v0.0.4 - github.com/gosuri/uitable v0.0.4 - github.com/grafov/m3u8 v0.12.0 - github.com/internetarchive/gocrawlhq v1.2.20 - github.com/paulbellamy/ratecounter v0.2.0 + 
github.com/grafana/pyroscope-go v1.2.0 + github.com/grafov/m3u8 v0.12.1 + github.com/internetarchive/gocrawlhq v1.2.28 github.com/philippgille/gokv/leveldb v0.7.0 - github.com/prometheus/client_golang v1.20.4 - github.com/remeh/sizedwaitgroup v1.0.0 - github.com/sirupsen/logrus v1.9.3 - github.com/spf13/cobra v1.8.1 - github.com/spf13/pflag v1.0.5 + github.com/rivo/tview v0.0.0-20241227133733-17b7edb88c57 + github.com/samber/slog-multi v1.4.0 + github.com/spf13/cobra v1.9.1 + github.com/spf13/pflag v1.0.6 github.com/spf13/viper v1.19.0 - github.com/telanflow/cookiejar v0.0.0-20190719062046-114449e86aa5 go.uber.org/goleak v1.3.0 - golang.org/x/net v0.29.0 - google.golang.org/protobuf v1.34.2 - mvdan.cc/xurls/v2 v2.5.0 + golang.org/x/net v0.35.0 + mvdan.cc/xurls/v2 v2.6.0 ) require ( - github.com/andybalholm/brotli v1.1.0 // indirect - github.com/andybalholm/cascadia v1.3.2 // indirect - github.com/aws/aws-sdk-go v1.55.5 // indirect - github.com/beorn7/perks v1.0.1 // indirect - github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/cloudflare/circl v1.4.0 // indirect - github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/elastic/elastic-transport-go/v8 v8.6.0 // indirect - github.com/fatih/color v1.17.0 // indirect - github.com/fsnotify/fsnotify v1.7.0 // indirect - github.com/go-logr/logr v1.4.2 // indirect - github.com/go-logr/stdr v1.2.2 // indirect + github.com/andybalholm/brotli v1.1.1 // indirect + github.com/andybalholm/cascadia v1.3.3 // indirect + github.com/cloudflare/circl v1.6.0 // indirect + github.com/dolthub/maphash v0.1.0 // indirect + github.com/fsnotify/fsnotify v1.8.0 // indirect + github.com/gammazero/deque v1.0.0 // indirect + github.com/gdamore/encoding v1.0.1 // indirect github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/ws v1.4.0 // indirect github.com/golang/snappy v0.0.4 // indirect - github.com/gomodule/redigo v1.9.2 // indirect - 
github.com/google/go-cmp v0.6.0 // indirect + github.com/grafana/pyroscope-go/godeltaprof v0.1.8 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/jmespath/go-jmespath v0.4.0 // indirect - github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.17.10 // indirect - github.com/magiconair/properties v1.8.7 // indirect - github.com/mattn/go-colorable v0.1.13 // indirect - github.com/mattn/go-isatty v0.0.20 // indirect + github.com/klauspost/compress v1.18.0 // indirect + github.com/lucasb-eyer/go-colorful v1.2.0 // indirect + github.com/magiconair/properties v1.8.9 // indirect github.com/mattn/go-runewidth v0.0.16 // indirect - github.com/miekg/dns v1.1.62 // indirect + github.com/maypok86/otter v1.2.4 // indirect + github.com/miekg/dns v1.1.63 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect - github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect - github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/onsi/gomega v1.34.2 // indirect + github.com/paulbellamy/ratecounter v0.2.0 // indirect github.com/pelletier/go-toml/v2 v2.2.3 // indirect github.com/philippgille/gokv/encoding v0.7.0 // indirect github.com/philippgille/gokv/util v0.7.0 // indirect - github.com/prometheus/client_model v0.6.1 // indirect - github.com/prometheus/common v0.59.1 // indirect - github.com/prometheus/procfs v0.15.1 // indirect github.com/refraction-networking/utls v1.6.7 // indirect github.com/rivo/uniseg v0.4.7 // indirect - github.com/sagikazarmark/locafero v0.6.0 // indirect + github.com/sagikazarmark/locafero v0.7.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect + github.com/samber/lo v1.49.1 // indirect github.com/sourcegraph/conc v0.3.0 // indirect - github.com/spf13/afero v1.11.0 // indirect - github.com/spf13/cast v1.7.0 // indirect + 
github.com/spf13/afero v1.12.0 // indirect + github.com/spf13/cast v1.7.1 // indirect github.com/subosito/gotenv v1.6.0 // indirect github.com/syndtr/goleveldb v1.0.0 // indirect github.com/ulikunitz/xz v0.5.12 // indirect - go.opentelemetry.io/otel v1.30.0 // indirect - go.opentelemetry.io/otel/metric v1.30.0 // indirect - go.opentelemetry.io/otel/trace v1.30.0 // indirect go.uber.org/multierr v1.11.0 // indirect - golang.org/x/crypto v0.27.0 // indirect - golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 // indirect - golang.org/x/mod v0.21.0 // indirect - golang.org/x/sync v0.8.0 // indirect - golang.org/x/sys v0.26.0 // indirect - golang.org/x/text v0.18.0 // indirect - golang.org/x/tools v0.25.0 // indirect + golang.org/x/crypto v0.33.0 // indirect + golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 // indirect + golang.org/x/mod v0.23.0 // indirect + golang.org/x/sync v0.11.0 // indirect + golang.org/x/sys v0.30.0 // indirect + golang.org/x/term v0.29.0 // indirect + golang.org/x/text v0.22.0 // indirect + golang.org/x/tools v0.30.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index a4f66821..b4a97ae3 100644 --- a/go.sum +++ b/go.sum @@ -1,54 +1,42 @@ -git.archive.org/wb/gocrawlhq v1.2.13 h1:PqEhgtYqNEUWO2JEJUHmXT+nIwW9LRgb4ocUFANciQo= -git.archive.org/wb/gocrawlhq v1.2.13/go.mod h1:JQIKgebFmpbxmEalNRjID3RwCxHkslt3PHAnum82KtM= -github.com/CorentinB/warc v0.8.52 h1:k6lkq3uh6PkhZG+WKpPEkeQPmO1byb7MnSZaNT28SH4= -github.com/CorentinB/warc v0.8.52/go.mod h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8= -github.com/CorentinB/warc v0.8.53 h1:xVz3RMdZ6faAqTtLfcK1/yl8ZTansy+B2en//EZLUlM= -github.com/CorentinB/warc v0.8.53/go.mod h1:NblONkMtoBB4TIigew6F6vakzu0z3YQTKNFS8U2FIn8= -github.com/PuerkitoBio/goquery v1.9.3 h1:mpJr/ikUA9/GNJB/DBZcGeFDXUtosHRyRrwh7KGdTG0= -github.com/PuerkitoBio/goquery v1.9.3/go.mod h1:1ndLHPdTz+DyQPICCWYlYQMPl0oXZj0G6D4LCYA6u4U= -github.com/PuerkitoBio/goquery v1.10.0 
h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4= -github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4= -github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= -github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= -github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= -github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= +github.com/CorentinB/warc v0.8.71 h1:b8RKjkoH0lkh9eU1RQirlh1Xh6dh9hs33WkDz6SLenM= +github.com/CorentinB/warc v0.8.71/go.mod h1:1fAGpKVIWnIuC79VdcrX0TyF00CHFzZ2VOYF7g5ivzA= +github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3 h1:ClzzXMDDuUbWfNNZqGeYq4PnYOlwlOVIvSyNaIy0ykg= +github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3/go.mod h1:we0YA5CsBbH5+/NUzC/AlMmxaDtWlXeNsqrwXjTzmzA= +github.com/PuerkitoBio/goquery v1.10.1 h1:Y8JGYUkXWTGRB6Ars3+j3kN0xg1YqqlwvdTV8WTFQcU= +github.com/PuerkitoBio/goquery v1.10.1/go.mod h1:IYiHrOMps66ag56LEH7QYDDupKXyo5A8qrjIx3ZtujY= +github.com/ada-url/goada v0.0.0-20250104020233-00cbf4dc9da1 h1:K54lYH7ZY/NHweMd9/R82dHaFelQQmwjEhUfwUqCqEk= +github.com/ada-url/goada v0.0.0-20250104020233-00cbf4dc9da1/go.mod h1:+D/veNwI2mA1hDYLVrYSobYcLFWm6e3DJ/H/d/dxlu8= +github.com/andybalholm/brotli v1.1.1 h1:PR2pgnyFznKEugtsUo0xLdDop5SKXd5Qf5ysW+7XdTA= +github.com/andybalholm/brotli v1.1.1/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA= +github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= +github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= -github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 
h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so= -github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw= -github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU= -github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= -github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= -github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= -github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= -github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/clbanning/mxj/v2 v2.7.0 h1:WA/La7UGCanFe5NpHF0Q3DNtnCsVoxbPKuyBNHWRyME= -github.com/clbanning/mxj/v2 v2.7.0/go.mod h1:hNiWqW14h+kc+MdF9C6/YoRfjEJoR3ou6tn/Qo+ve2s= -github.com/cloudflare/circl v1.4.0 h1:BV7h5MgrktNzytKmWjpOtdYrf0lkkbF8YMlBGPhJQrY= -github.com/cloudflare/circl v1.4.0/go.mod h1:PDRU+oXvdD7KCtgKxW95M5Z8BpSCJXQORiZFnBQS5QU= -github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/cloudflare/circl v1.6.0 h1:cr5JKic4HI+LkINy2lg3W2jF8sHCVTBncJr5gIIq7qk= +github.com/cloudflare/circl v1.6.0/go.mod h1:uddAzsPgqdMAYatqJ0lsjX1oECcQLIlRpzZh3pJrofs= +github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dolthub/maphash v0.1.0 h1:bsQ7JsF4FkkWyrP3oCnFJgrCUAFbFf3kOl4L/QxPDyQ= +github.com/dolthub/maphash v0.1.0/go.mod h1:gkg4Ch4CdCDu5h6PMriVLawB7koZ+5ijb9puGMV50a4= 
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= -github.com/elastic/elastic-transport-go/v8 v8.6.0 h1:Y2S/FBjx1LlCv5m6pWAF2kDJAHoSjSRSJCApolgfthA= -github.com/elastic/elastic-transport-go/v8 v8.6.0/go.mod h1:YLHer5cj0csTzNFXoNQ8qhtGY1GTvSqPnKWKaqQE3Hk= -github.com/elastic/go-elasticsearch/v8 v8.15.0 h1:IZyJhe7t7WI3NEFdcHnf6IJXqpRf+8S8QWLtZYYyBYk= -github.com/elastic/go-elasticsearch/v8 v8.15.0/go.mod h1:HCON3zj4btpqs2N1jjsAy4a/fiAul+YBP00mBH4xik8= -github.com/fatih/color v1.17.0 h1:GlRw1BRJxkpqUCBKzKOw098ed57fEsKeNjpTe3cSjK4= -github.com/fatih/color v1.17.0/go.mod h1:YZ7TlrGPkiz6ku9fK3TLD/pl3CpsiFyu8N92HLgmosI= +github.com/dvyukov/go-fuzz v0.0.0-20200318091601-be3528f3a813/go.mod h1:11Gm+ccJnvAhCNLlf5+cS9KjtbaD5I5zaZpFMsTHWTw= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= -github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= -github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= -github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= -github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= -github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= -github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M= +github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/gabriel-vasile/mimetype v1.4.8 
h1:FfZ3gj38NjllZIeJAmMhr+qKL8Wu+nOoI3GqacKw1NM= +github.com/gabriel-vasile/mimetype v1.4.8/go.mod h1:ByKUIKGjh1ODkGM1asKUbQZOLGrPjydw3hYPU2YU9t8= +github.com/gammazero/deque v1.0.0 h1:LTmimT8H7bXkkCy6gZX7zNLtkbz4NdS2z8LZuor3j34= +github.com/gammazero/deque v1.0.0/go.mod h1:iflpYvtGfM3U8S8j+sZEKIak3SAKYpA5/SQewgfXDKo= +github.com/gdamore/encoding v1.0.1 h1:YzKZckdBL6jVt2Gc+5p82qhrGiqMdG/eNs6Wy0u3Uhw= +github.com/gdamore/encoding v1.0.1/go.mod h1:0Z0cMFinngz9kS1QfMjCP8TY7em3bZYeeklsSDPivEo= +github.com/gdamore/tcell/v2 v2.8.1 h1:KPNxyqclpWpWQlPLx6Xui1pMk8S+7+R37h3g07997NU= +github.com/gdamore/tcell/v2 v2.8.1/go.mod h1:bj8ori1BG3OYMjmb3IklZVWfZUJ1UBQt9JXrOCOhGWw= github.com/go-test/deep v1.1.0 h1:WOcxcdHcvdgThNXjw0t76K42FXTU7HpNQWHpA2HHNlg= github.com/go-test/deep v1.1.0/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE= github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU= @@ -61,74 +49,44 @@ github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5y github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/gomodule/redigo v1.9.2 h1:HrutZBLhSIU8abiSfW8pj8mPhOyMYjZT/wcA4/L9L9s= -github.com/gomodule/redigo v1.9.2/go.mod h1:KsU3hiK/Ay8U42qpaJk+kuNa3C+spxapWpM+ywhcgtw= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gosuri/uilive v0.0.4 h1:hUEBpQDj8D8jXgtCdBu7sWsy5sbW/5GhuO8KBwJ2jyY= -github.com/gosuri/uilive 
v0.0.4/go.mod h1:V/epo5LjjlDE5RJUcqx8dbw+zc93y5Ya3yg8tfZ74VI= -github.com/gosuri/uitable v0.0.4 h1:IG2xLKRvErL3uhY6e1BylFzG+aJiwQviDDTfOKeKTpY= -github.com/gosuri/uitable v0.0.4/go.mod h1:tKR86bXuXPZazfOTG1FIzvjIdXzd0mo4Vtn16vt0PJo= -github.com/grafov/m3u8 v0.12.0 h1:T6iTwTsSEtMcwkayef+FJO8kj+Sglr4Lh81Zj8Ked/4= -github.com/grafov/m3u8 v0.12.0/go.mod h1:nqzOkfBiZJENr52zTVd/Dcl03yzphIMbJqkXGu+u080= +github.com/grafana/pyroscope-go v1.2.0 h1:aILLKjTj8CS8f/24OPMGPewQSYlhmdQMBmol1d3KGj8= +github.com/grafana/pyroscope-go v1.2.0/go.mod h1:2GHr28Nr05bg2pElS+dDsc98f3JTUh2f6Fz1hWXrqwk= +github.com/grafana/pyroscope-go/godeltaprof v0.1.8 h1:iwOtYXeeVSAeYefJNaxDytgjKtUuKQbJqgAIjlnicKg= +github.com/grafana/pyroscope-go/godeltaprof v0.1.8/go.mod h1:2+l7K7twW49Ct4wFluZD3tZ6e0SjanjcUUBPVD/UuGU= +github.com/grafov/m3u8 v0.12.1 h1:DuP1uA1kvRRmGNAZ0m+ObLv1dvrfNO0TPx0c/enNk0s= +github.com/grafov/m3u8 v0.12.1/go.mod h1:nqzOkfBiZJENr52zTVd/Dcl03yzphIMbJqkXGu+u080= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= -github.com/internetarchive/gocrawlhq v1.2.13 h1:ALfUrWR7nRez5gWhHRJ7ZklIpGMjERGMUJqR4HBl4+8= -github.com/internetarchive/gocrawlhq v1.2.13/go.mod h1:JQIKgebFmpbxmEalNRjID3RwCxHkslt3PHAnum82KtM= -github.com/internetarchive/gocrawlhq v1.2.14 h1:g3MPMonpA6mTkCpjBvW3paeBHiH+gGgwSvkyX/lxu7s= -github.com/internetarchive/gocrawlhq v1.2.14/go.mod h1:IOHVfWsptADzh+r2J+UnSm22EB9r8TiVVeAuP9WRFoc= -github.com/internetarchive/gocrawlhq v1.2.15 h1:Llv6tvxxRUxoC9G4GsjkpbfKX0anbQUU+pwFiROlxzg= 
-github.com/internetarchive/gocrawlhq v1.2.15/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs= -github.com/internetarchive/gocrawlhq v1.2.16 h1:D9JJdLL8uqpHUDU3SxxcXUjQETbxnk08e9xo929xrlE= -github.com/internetarchive/gocrawlhq v1.2.16/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs= -github.com/internetarchive/gocrawlhq v1.2.17 h1:nSjFHpDp5C9Q8SrDPibC4Iiih6kpw18+2GnifJiVpO0= -github.com/internetarchive/gocrawlhq v1.2.17/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs= -github.com/internetarchive/gocrawlhq v1.2.18 h1:PPe7UqJ2NNOljn70SmUhoKdgPreeqRUk9XVrYShCn4w= -github.com/internetarchive/gocrawlhq v1.2.18/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs= -github.com/internetarchive/gocrawlhq v1.2.19 h1:bvDliaeWjt97x64bOf+rKXStQX7VE+ZON/I1FS3sQ6A= -github.com/internetarchive/gocrawlhq v1.2.19/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek= -github.com/internetarchive/gocrawlhq v1.2.20 h1:0mIIt9lhPacKr6L2JeISoopQ8EgzC3dISJ3ITGGbOp4= -github.com/internetarchive/gocrawlhq v1.2.20/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek= -github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= -github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= -github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= -github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= -github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/klauspost/compress v1.17.10 h1:oXAz+Vh0PMUvJczoi+flxpnBEPxoER1IaAnU/NMPtT0= -github.com/klauspost/compress v1.17.10/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= +github.com/internetarchive/gocrawlhq v1.2.28 h1:R21OPHyAe2GhdR7D84PRju/cqQxT2fmlKLZlN1jY3xc= +github.com/internetarchive/gocrawlhq v1.2.28/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek= +github.com/klauspost/compress v1.18.0 
h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= -github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= -github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= -github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= -github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= -github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= -github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= -github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= -github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY= +github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= +github.com/magiconair/properties v1.8.9 h1:nWcCbLq1N2v/cpNsy5WvQ37Fb+YElfq20WJ/a8RkpQM= +github.com/magiconair/properties v1.8.9/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc= github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= -github.com/miekg/dns v1.1.62 h1:cN8OuEF1/x5Rq6Np+h1epln8OiyPWV+lROx9LxcGgIQ= -github.com/miekg/dns v1.1.62/go.mod h1:mvDlcItzm+br7MToIKqkglaGhlFMHJ9DTNNWONWXbNQ= 
+github.com/maypok86/otter v1.2.4 h1:HhW1Pq6VdJkmWwcZZq19BlEQkHtI8xgsQzBVXJU0nfc= +github.com/maypok86/otter v1.2.4/go.mod h1:mKLfoI7v1HOmQMwFgX4QkRk23mX6ge3RDvjdHOWG4R4= +github.com/miekg/dns v1.1.63 h1:8M5aAw6OMZfFXTT7K5V0Eu5YiiL8l7nUAkyN6C9YwaY= +github.com/miekg/dns v1.1.63/go.mod h1:6NGHfjhpmr5lt3XPLuyfDJi5AXbNIPM9PY6H6sF1Nfs= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= -github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= -github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= -github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= -github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.7.0 h1:WSHQ+IS43OoUrWtD1/bbclrwK8TTH5hzp+umCiuxHgs= github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= @@ -152,130 +110,146 @@ github.com/philippgille/gokv/util v0.7.0/go.mod h1:i9KLHbPxGiHLMhkix/CcDQhpPbCkJ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib 
v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.20.4 h1:Tgh3Yr67PaOv/uTqloMsCEdeuFTatm5zIq5+qNN23vI= -github.com/prometheus/client_golang v1.20.4/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= -github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= -github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= -github.com/prometheus/common v0.59.1 h1:LXb1quJHWm1P6wq/U824uxYi4Sg0oGvNeUm1z5dJoX0= -github.com/prometheus/common v0.59.1/go.mod h1:GpWM7dewqmVYcd7SmRaiWVe9SSqjf0UrwnYnpEZNuT0= -github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= -github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/refraction-networking/utls v1.6.7 h1:zVJ7sP1dJx/WtVuITug3qYUq034cDq9B2MR1K67ULZM= github.com/refraction-networking/utls v1.6.7/go.mod h1:BC3O4vQzye5hqpmDTWUqi4P5DDhzJfkV1tdqtawQIH0= -github.com/remeh/sizedwaitgroup v1.0.0 h1:VNGGFwNo/R5+MJBf6yrsr110p0m4/OX4S3DCy7Kyl5E= -github.com/remeh/sizedwaitgroup v1.0.0/go.mod h1:3j2R4OIe/SeS6YDhICBy22RWjJC5eNCJ1V+9+NVNYlo= +github.com/rivo/tview v0.0.0-20241227133733-17b7edb88c57 h1:LmsF7Fk5jyEDhJk0fYIqdWNuTxSyid2W42A0L2YWjGE= +github.com/rivo/tview v0.0.0-20241227133733-17b7edb88c57/go.mod h1:02iFIz7K/A9jGCvrizLPvoqr4cEIx7q54RH5Qudkrss= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/rivo/uniseg v0.4.3/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= -github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= -github.com/rogpeppe/go-internal 
v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= +github.com/rogpeppe/go-internal v1.13.2-0.20241226121412-a5dc8ff20d0a h1:w3tdWGKbLGBPtR/8/oO74W6hmz0qE5q0z9aqSAewaaM= +github.com/rogpeppe/go-internal v1.13.2-0.20241226121412-a5dc8ff20d0a/go.mod h1:S8kfXMp+yh77OxPD4fdM6YUknrZpQxLhvxzS4gDHENY= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/sagikazarmark/locafero v0.6.0 h1:ON7AQg37yzcRPU69mt7gwhFEBwxI6P9T4Qu3N51bwOk= -github.com/sagikazarmark/locafero v0.6.0/go.mod h1:77OmuIc6VTraTXKXIs/uvUxKGUXjE1GbemJYHqdNjX0= +github.com/sagikazarmark/locafero v0.7.0 h1:5MqpDsTGNDhY8sGp0Aowyf0qKsPrhewaLSsFaodPcyo= +github.com/sagikazarmark/locafero v0.7.0/go.mod h1:2za3Cg5rMaTMoG/2Ulr9AwtFaIppKXTRYnozin4aB5k= github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE= github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ= -github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= -github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/samber/lo v1.49.1 h1:4BIFyVfuQSEpluc7Fua+j1NolZHiEHEpaSEKdsH0tew= +github.com/samber/lo v1.49.1/go.mod h1:dO6KHFzUKXgP8LDhU0oI8d2hekjXnGOu0DB8Jecxd6o= +github.com/samber/slog-multi v1.4.0 h1:pwlPMIE7PrbTHQyKWDU+RIoxP1+HKTNOujk3/kdkbdg= +github.com/samber/slog-multi v1.4.0/go.mod h1:FsQ4Uv2L+E/8TZt+/BVgYZ1LoDWCbfCU21wVIoMMrO8= github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0= -github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8= -github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY= -github.com/spf13/cast v1.7.0 h1:ntdiHjuueXFgm5nzDRdOS4yfT43P5Fnud6DH50rz/7w= -github.com/spf13/cast v1.7.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= 
-github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= -github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= -github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/afero v1.12.0 h1:UcOPyRBYczmFn6yvphxkn9ZEOY65cpwGKb5mL36mrqs= +github.com/spf13/afero v1.12.0/go.mod h1:ZTlWwG4/ahT8W7T0WQ5uYmjI9duaLQGy3Q2OAl4sk/4= +github.com/spf13/cast v1.7.1 h1:cuNEagBQEHWN1FnbGEjCXL2szYEXqfJPbP2HNUaca9Y= +github.com/spf13/cast v1.7.1/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= +github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= +github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= +github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= +github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/viper v1.19.0 h1:RWq5SEjt8o25SROyN3z2OrDB9l7RPd3lwTWU8EcEdcI= github.com/spf13/viper v1.19.0/go.mod h1:GQUN9bilAbhU/jgc1bKs99f/suXKeUMct8Adx5+Ntkg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod 
h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE= github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= -github.com/telanflow/cookiejar v0.0.0-20190719062046-114449e86aa5 h1:gTQl5nPlc9B53vFOKM8aJHwxB2BW2kM49PVR5526GBg= -github.com/telanflow/cookiejar v0.0.0-20190719062046-114449e86aa5/go.mod h1:qNgA5MKwTh103SxGTooqZMiKxZTaV9UV3KjN7I7Drig= github.com/ulikunitz/xz v0.5.12 h1:37Nm15o69RwBkXM0J6A5OlE67RZTfzUxTj8fB3dfcsc= github.com/ulikunitz/xz v0.5.12/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= +github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= +github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -go.opentelemetry.io/otel v1.30.0 h1:F2t8sK4qf1fAmY9ua4ohFS/K+FUuOPemHUIXHtktrts= -go.opentelemetry.io/otel v1.30.0/go.mod h1:tFw4Br9b7fOS+uEao81PJjVMjW/5fvNCbpsDIXqP0pc= -go.opentelemetry.io/otel/metric v1.30.0 h1:4xNulvn9gjzo4hjg+wzIKG7iNFEaBMX00Qd4QIZs7+w= -go.opentelemetry.io/otel/metric v1.30.0/go.mod h1:aXTfST94tswhWEb+5QjlSqG+cZlmyXy/u8jFpor3WqQ= -go.opentelemetry.io/otel/sdk v1.21.0 h1:FTt8qirL1EysG6sTQRZ5TokkU8d0ugCj8htOgThZXQ8= -go.opentelemetry.io/otel/sdk v1.21.0/go.mod h1:Nna6Yv7PWTdgJHVRD9hIYywQBRx7pbox6nwBnZIxl/E= -go.opentelemetry.io/otel/trace v1.30.0 h1:7UBkkYzeg3C7kQX8VAidWh2biiQbtAKjyIML8dQ9wmc= -go.opentelemetry.io/otel/trace v1.30.0/go.mod h1:5EyKqTzzmyqB9bwtCCq6pDLktPK6fmGf/Dph+8VI02o= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 
h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A= -golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70= -golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk= -golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus= +golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M= +golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 h1:yqrTHse8TCMW1M1ZCP+VAR/l0kKxwaAIqN/il7x4voA= +golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8/go.mod h1:tujkw807nyEEAamNbDrEGzRav+ilXA7PCRAd6xsmwiU= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0= -golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0/go.mod 
h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.23.0 h1:Zb7khfcRGKk+kqfxFaP5tZqCnDZMjC5VtUBs87Hr6QM= +golang.org/x/mod v0.23.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= -golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo= -golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8= +golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= 
-golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w= +golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= -golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= -golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys 
v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= +golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= -golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= +golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek= +golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU= +golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod 
h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= -golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= +golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= -golang.org/x/tools v0.25.0 h1:oFU9pkj/iJgs+0DT+VMHrx+oBKs/LJMV+Uvg78sl+fE= -golang.org/x/tools v0.25.0/go.mod h1:/vtpO8WL1N9cQC3FN5zPqb//fRXskFHbLKk4OW1Q7rg= +golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/tools v0.30.0 h1:BgcpHewrV5AUp2G9MebG4XPFI1E2W41zU1SaqVA9vJY= +golang.org/x/tools v0.30.0/go.mod h1:c347cR/OJfw5TI+GfX7RUPNMdDRRbjvYTS0jPyvsVtY= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= -google.golang.org/protobuf v1.34.2/go.mod 
h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= -gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= @@ -283,9 +257,8 @@ gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -mvdan.cc/xurls/v2 v2.5.0 h1:lyBNOm8Wo71UknhUs4QTFUNNMyxy2JEIaKKo0RWOh+8= -mvdan.cc/xurls/v2 v2.5.0/go.mod h1:yQgaGQ1rFtJUzkmKiHYSSfuQxqfYmd//X6PxvholpeE= +mvdan.cc/xurls/v2 v2.6.0 h1:3NTZpeTxYVWNSokW3MKeyVkz/j7uYXYiMtXRUfmjbgI= +mvdan.cc/xurls/v2 v2.6.0/go.mod h1:bCvEZ1XvdA6wDnxY7jPPjEmigDtvtvPXAD/Exa9IMSk= diff --git a/internal/pkg/archiver/archiver.go b/internal/pkg/archiver/archiver.go new file mode 100644 index 00000000..2c513dff --- /dev/null +++ 
b/internal/pkg/archiver/archiver.go @@ -0,0 +1,249 @@ +package archiver + +import ( + "context" + "fmt" + "net/http" + "strconv" + "sync" + "time" + + "github.com/CorentinB/warc" + "github.com/dustin/go-humanize" + "github.com/gabriel-vasile/mimetype" + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/controler/pause" + "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/Zeno/internal/pkg/postprocessor/domainscrawl" + "github.com/internetarchive/Zeno/internal/pkg/stats" + "github.com/internetarchive/Zeno/pkg/models" +) + +func init() { + // We intentionally set the limit to 0 to disable the limit on the number of bytes the + // mimetype detection can accept. We limit the number of bytes that we will give to it + // in the processBody function instead. + mimetype.SetLimit(0) +} + +type archiver struct { + wg sync.WaitGroup + ctx context.Context + cancel context.CancelFunc + inputCh chan *models.Item + outputCh chan *models.Item + + Client *warc.CustomHTTPClient + ClientWithProxy *warc.CustomHTTPClient +} + +var ( + globalArchiver *archiver + once sync.Once + logger *log.FieldedLogger +) + +// Start initializes the internal archiver structure, start the WARC writer and start routines, should only be called once and returns an error if called more than once +func Start(inputChan, outputChan chan *models.Item) error { + var done bool + + log.Start() + logger = log.NewFieldedLogger(&log.Fields{ + "component": "archiver", + }) + + stats.Init() + + once.Do(func() { + ctx, cancel := context.WithCancel(context.Background()) + globalArchiver = &archiver{ + ctx: ctx, + cancel: cancel, + inputCh: inputChan, + outputCh: outputChan, + } + logger.Debug("initialized") + + // Setup WARC writing HTTP clients + startWARCWriter() + go watchWARCWritingQueue(250 * time.Millisecond) + + logger.Debug("WARC writer started") + + for i := 0; i < config.Get().WorkersCount; i++ { + globalArchiver.wg.Add(1) + go 
globalArchiver.worker(strconv.Itoa(i)) + } + + logger.Info("started") + done = true + }) + + if !done { + return ErrArchiverAlreadyInitialized + } + + return nil +} + +// Stop stops the archiver routines and the WARC writer +func Stop() { + if globalArchiver != nil { + globalArchiver.cancel() + globalArchiver.wg.Wait() + + // Wait for the WARC writing to finish + stopLocalWatcher := make(chan struct{}) + go func() { + for { + select { + case <-stopLocalWatcher: + return + case <-time.After(1 * time.Second): + logger.Debug("waiting for WARC writing to finish", "queue_size", GetWARCWritingQueueSize(), "bytes_written", humanize.Bytes(uint64(warc.DataTotal.Value()))) + } + } + }() + globalArchiver.Client.WaitGroup.Wait() + stopLocalWatcher <- struct{}{} + logger.Debug("WARC writing finished") + globalArchiver.Client.Close() + if globalArchiver.ClientWithProxy != nil { + globalArchiver.ClientWithProxy.WaitGroup.Wait() + globalArchiver.ClientWithProxy.Close() + } + + watchWARCWritingQueueCancel() + + logger.Info("stopped") + } +} + +func (a *archiver) worker(workerID string) { + defer a.wg.Done() + + logger := log.NewFieldedLogger(&log.Fields{ + "component": "archiver.worker", + "worker_id": workerID, + }) + + defer logger.Debug("worker stopped") + + // Subscribe to the pause controler + controlChans := pause.Subscribe() + defer pause.Unsubscribe(controlChans) + + stats.ArchiverRoutinesIncr() + defer stats.ArchiverRoutinesDecr() + + for { + select { + case <-a.ctx.Done(): + logger.Debug("shutting down") + return + case <-controlChans.PauseCh: + logger.Debug("received pause event") + controlChans.ResumeCh <- struct{}{} + logger.Debug("received resume event") + case seed, ok := <-a.inputCh: + if ok { + logger.Debug("received seed", "seed", seed.GetShortID(), "depth", seed.GetDepth(), "hops", seed.GetURL().GetHops()) + + if err := seed.CheckConsistency(); err != nil { + panic(fmt.Sprintf("seed consistency check failed with err: %s, seed id %s", err.Error(), 
seed.GetShortID())) + } + + if seed.GetStatus() != models.ItemPreProcessed && seed.GetStatus() != models.ItemGotRedirected && seed.GetStatus() != models.ItemGotChildren { + logger.Debug("skipping seed", "seed", seed.GetShortID(), "depth", seed.GetDepth(), "hops", seed.GetURL().GetHops(), "status", seed.GetStatus().String()) + } else { + archive(workerID, seed) + } + + select { + case <-a.ctx.Done(): + logger.Debug("aborting seed due to stop", "seed", seed.GetShortID(), "depth", seed.GetDepth(), "hops", seed.GetURL().GetHops()) + return + case a.outputCh <- seed: + } + } + } + } +} + +func archive(workerID string, seed *models.Item) { + // TODO: rate limiting handling + logger := log.NewFieldedLogger(&log.Fields{ + "component": "archiver.archive", + "worker_id": workerID, + }) + + var ( + guard = make(chan struct{}, config.Get().MaxConcurrentAssets) + wg sync.WaitGroup + ) + + items, err := seed.GetNodesAtLevel(seed.GetMaxDepth()) + if err != nil { + logger.Error("unable to get nodes at level", "err", err.Error(), "seed_id", seed.GetShortID()) + panic(err) + } + + for i := range items { + if items[i].GetStatus() != models.ItemPreProcessed { + logger.Debug("skipping item", "seed_id", seed.GetShortID(), "item_id", items[i].GetShortID(), "status", items[i].GetStatus().String(), "depth", items[i].GetDepth()) + continue + } + + guard <- struct{}{} + + wg.Add(1) + go func(item *models.Item) { + defer wg.Done() + defer func() { <-guard }() + defer stats.URLsCrawledIncr() + + var ( + err error + resp *http.Response + ) + + // Execute the request + req := item.GetURL().GetRequest() + if req == nil { + panic("request is nil") + } + if config.Get().Proxy != "" { + resp, err = globalArchiver.ClientWithProxy.Do(req) + } else { + resp, err = globalArchiver.Client.Do(req) + } + if err != nil { + logger.Error("unable to execute request", "err", err.Error(), "seed_id", seed.GetShortID(), "item_id", item.GetShortID(), "depth", item.GetDepth(), "hops", item.GetURL().GetHops()) + 
item.SetStatus(models.ItemFailed) + return + } + + // Set the response in the URL + item.GetURL().SetResponse(resp) + + // Process the body + err = ProcessBody(item.GetURL(), config.Get().DisableAssetsCapture, domainscrawl.Enabled(), config.Get().MaxHops, config.Get().WARCTempDir) + if err != nil { + logger.Error("unable to process body", "err", err.Error(), "item_id", item.GetShortID(), "seed_id", seed.GetShortID(), "depth", item.GetDepth(), "hops", item.GetURL().GetHops()) + item.SetStatus(models.ItemFailed) + return + } + + stats.HTTPReturnCodesIncr(strconv.Itoa(resp.StatusCode)) + + logger.Info("url archived", "url", item.GetURL().String(), "seed_id", seed.GetShortID(), "item_id", item.GetShortID(), "depth", item.GetDepth(), "hops", item.GetURL().GetHops(), "status", resp.StatusCode) + + item.SetStatus(models.ItemArchived) + }(items[i]) + } + + // Wait for all goroutines to finish + wg.Wait() + + return +} diff --git a/internal/pkg/archiver/body.go b/internal/pkg/archiver/body.go new file mode 100644 index 00000000..34910afc --- /dev/null +++ b/internal/pkg/archiver/body.go @@ -0,0 +1,125 @@ +package archiver + +import ( + "bytes" + "io" + "strings" + "time" + + "github.com/CorentinB/warc/pkg/spooledtempfile" + "github.com/gabriel-vasile/mimetype" + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/pkg/models" +) + +// ProcessBody processes the body of a URL response, loading it into memory or a temporary file +func ProcessBody(u *models.URL, disableAssetsCapture, domainsCrawl bool, maxHops int, WARCTempDir string) error { + defer u.GetResponse().Body.Close() // Ensure the response body is closed + + // Retrieve the underlying TCP connection and apply a 10s read deadline + conn, ok := u.GetResponse().Body.(interface{ SetReadDeadline(time.Time) error }) + if ok { + err := conn.SetReadDeadline(time.Now().Add(time.Duration(config.Get().HTTPReadDeadline))) + if err != nil { + return err + } + } + + // If we are not capturing 
assets, not extracting outlinks, and domains crawl is disabled + // we can just consume and discard the body + if disableAssetsCapture && !domainsCrawl && maxHops == 0 { + if err := copyWithTimeout(io.Discard, u.GetResponse().Body, conn); err != nil { + return err + } + } + + // Create a buffer to hold the body (first 2KB) + buffer := new(bytes.Buffer) + if err := copyWithTimeoutN(buffer, u.GetResponse().Body, 2048, conn); err != nil { + return err + } + + // Detect and set MIME type + u.SetMIMEType(mimetype.Detect(buffer.Bytes())) + + // Check if the MIME type requires post-processing + if (u.GetMIMEType().Parent() != nil && u.GetMIMEType().Parent().String() == "text/plain") || + strings.Contains(u.GetMIMEType().String(), "text/") { + + // Create a temp file with a 2MB memory buffer + spooledBuff := spooledtempfile.NewSpooledTempFile("zeno", WARCTempDir, 2097152, false, -1) + _, err := io.Copy(spooledBuff, buffer) + if err != nil { + closeErr := spooledBuff.Close() + if closeErr != nil { + panic(closeErr) + } + return err + } + + // Read the rest of the body into the spooled buffer + if err := copyWithTimeout(spooledBuff, u.GetResponse().Body, conn); err != nil { + closeErr := spooledBuff.Close() + if closeErr != nil { + panic(closeErr) + } + return err + } + + u.SetBody(spooledBuff) + u.RewindBody() + + return nil + } else { + // Read the rest of the body but discard it + if err := copyWithTimeout(io.Discard, u.GetResponse().Body, conn); err != nil { + return err + } + } + + return nil +} + +// copyWithTimeout copies data and resets the read deadline after each successful read +func copyWithTimeout(dst io.Writer, src io.Reader, conn interface{ SetReadDeadline(time.Time) error }) error { + buf := make([]byte, 4096) + for { + n, err := src.Read(buf) + if n > 0 { + // Reset the deadline after each successful read + if conn != nil { + err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.Get().HTTPReadDeadline))) + if err != nil { + return err + } + } + if 
_, writeErr := dst.Write(buf[:n]); writeErr != nil { + return writeErr + } + } + if err != nil { + if err == io.EOF { + break + } + return err + } + } + return nil +} + +// copyWithTimeoutN copies a limited number of bytes and applies the timeout +func copyWithTimeoutN(dst io.Writer, src io.Reader, n int64, conn interface{ SetReadDeadline(time.Time) error }) error { + _, err := io.CopyN(dst, src, n) + if err != nil && err != io.EOF { + return err + } + + // Reset deadline after partial read + if conn != nil { + err = conn.SetReadDeadline(time.Now().Add(time.Duration(config.Get().HTTPReadDeadline))) + if err != nil { + return err + } + } + return nil +} diff --git a/internal/pkg/archiver/error.go b/internal/pkg/archiver/error.go new file mode 100644 index 00000000..d0af5b7b --- /dev/null +++ b/internal/pkg/archiver/error.go @@ -0,0 +1,8 @@ +package archiver + +import "errors" + +var ( + // ErrArchiverAlreadyInitialized is the error returned when the preprocess is already initialized + ErrArchiverAlreadyInitialized = errors.New("archiver already initialized") +) diff --git a/internal/pkg/archiver/warc.go b/internal/pkg/archiver/warc.go new file mode 100644 index 00000000..b38f5607 --- /dev/null +++ b/internal/pkg/archiver/warc.go @@ -0,0 +1,138 @@ +package archiver + +import ( + "context" + "os" + "path" + "time" + + "github.com/CorentinB/warc" + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/Zeno/internal/pkg/stats" +) + +func startWARCWriter() { + // Configure WARC rotator settings + rotatorSettings := warc.NewRotatorSettings() + rotatorSettings.Prefix = config.Get().WARCPrefix + rotatorSettings.WARCWriterPoolSize = config.Get().WARCPoolSize + rotatorSettings.WarcSize = float64(config.Get().WARCSize) + rotatorSettings.OutputDirectory = path.Join(config.Get().JobPath, "warcs") + + // Configure WARC dedupe settings + dedupeOptions := warc.DedupeOptions{LocalDedupe: 
!config.Get().DisableLocalDedupe, SizeThreshold: config.Get().WARCDedupeSize} + if config.Get().CDXDedupeServer != "" { + dedupeOptions = warc.DedupeOptions{ + LocalDedupe: !config.Get().DisableLocalDedupe, + CDXDedupe: true, + CDXURL: config.Get().CDXDedupeServer, + CDXCookie: config.Get().CDXCookie, + SizeThreshold: config.Get().WARCDedupeSize, + } + } + + // Configure WARC settings + WARCSettings := warc.HTTPClientSettings{ + RotatorSettings: rotatorSettings, + DedupeOptions: dedupeOptions, + DecompressBody: true, + SkipHTTPStatusCodes: []int{429}, + VerifyCerts: config.Get().CertValidation, + TempDir: config.Get().WARCTempDir, + FullOnDisk: config.Get().WARCOnDisk, + RandomLocalIP: config.Get().RandomLocalIP, + DisableIPv4: config.Get().DisableIPv4, + DisableIPv6: config.Get().DisableIPv6, + IPv6AnyIP: config.Get().IPv6AnyIP, + } + + // Instantiate WARC client + var err error + if config.Get().Proxy != "" { + proxiedWARCSettings := WARCSettings + proxiedWARCSettings.Proxy = config.Get().Proxy + globalArchiver.ClientWithProxy, err = warc.NewWARCWritingHTTPClient(proxiedWARCSettings) + if err != nil { + logger.Error("unable to init proxied WARC HTTP client", "err", err.Error(), "func", "archiver.startWARCWriter") + os.Exit(1) + } + + go func() { + for err := range globalArchiver.ClientWithProxy.ErrChan { + logger.Error("WARC writer error", "err", err.Err.Error(), "func", err.Func) + } + }() + } + + // Even if a proxied client has been set, we want to create an non-proxied one + // if DomainsBypassProxy is used. 
The domains specified in this slice won't go + // through the proxied client, but through a "normal" client + if config.Get().Proxy == "" || len(config.Get().DomainsBypassProxy) > 0 { + globalArchiver.Client, err = warc.NewWARCWritingHTTPClient(WARCSettings) + if err != nil { + logger.Error("unable to init WARC HTTP client", "err", err.Error(), "func", "archiver.startWARCWriter") + os.Exit(1) + } + + go func() { + for err := range globalArchiver.Client.ErrChan { + logger.Error("WARC writer error", "err", err.Err.Error(), "func", err.Func) + } + }() + } + + // Set the timeouts + if config.Get().HTTPTimeout > 0 { + if globalArchiver.Client != nil { + globalArchiver.Client.Timeout = time.Duration(config.Get().HTTPTimeout) * time.Second + } + + if globalArchiver.ClientWithProxy != nil { + globalArchiver.ClientWithProxy.Timeout = time.Duration(config.Get().HTTPTimeout) * time.Second + } + } +} + +func GetClients() (clients []*warc.CustomHTTPClient) { + for _, c := range []*warc.CustomHTTPClient{globalArchiver.Client, globalArchiver.ClientWithProxy} { + if c != nil { + clients = append(clients, c) + } + } + + return clients +} + +func GetWARCWritingQueueSize() (total int) { + for _, c := range []*warc.CustomHTTPClient{globalArchiver.Client, globalArchiver.ClientWithProxy} { + if c != nil { + total += c.WaitGroup.Size() + } + } + + return total +} + +var ( + watchWARCWritingQueueContext, watchWARCWritingQueueCancel = context.WithCancel(context.Background()) +) + +func watchWARCWritingQueue(interval time.Duration) { + logger := log.NewFieldedLogger(&log.Fields{ + "component": "archiver.warcWritingQueueWatcher", + }) + + ticker := time.NewTicker(interval) + defer ticker.Stop() + + for { + select { + case <-watchWARCWritingQueueContext.Done(): + logger.Debug("closed") + return + case <-ticker.C: + stats.WarcWritingQueueSizeSet(int64(GetWARCWritingQueueSize())) + } + } +} diff --git a/internal/pkg/config/config.go b/internal/pkg/config/config.go new file mode 100644 index 
00000000..448b1663 --- /dev/null +++ b/internal/pkg/config/config.go @@ -0,0 +1,393 @@ +package config + +import ( + "bufio" + "fmt" + "log/slog" + "net/http" + "net/url" + "os" + "path" + "path/filepath" + "regexp" + "strings" + "sync" + "time" + + "github.com/google/uuid" + "github.com/internetarchive/Zeno/internal/pkg/postprocessor/domainscrawl" + "github.com/internetarchive/Zeno/internal/pkg/utils" + "github.com/spf13/pflag" + "github.com/spf13/viper" +) + +// Config holds all configuration for our program, parsed from various sources +// The `mapstructure` tags are used to map the fields to the viper configuration +type Config struct { + Job string `mapstructure:"job"` + JobPath string + + // UseSeencheck exists just for convenience of not checking + // !DisableSeencheck in the rest of the code, to make the code clearer + DisableSeencheck bool `mapstructure:"disable-seencheck"` + UseSeencheck bool + + UserAgent string `mapstructure:"user-agent"` + Cookies string `mapstructure:"cookies"` + APIPort string `mapstructure:"api-port"` + PrometheusPrefix string `mapstructure:"prometheus-prefix"` + WARCPrefix string `mapstructure:"warc-prefix"` + WARCOperator string `mapstructure:"warc-operator"` + WARCTempDir string `mapstructure:"warc-temp-dir"` + WARCSize int `mapstructure:"warc-size"` + WARCOnDisk bool `mapstructure:"warc-on-disk"` + WARCPoolSize int `mapstructure:"warc-pool-size"` + WARCDedupeSize int `mapstructure:"warc-dedupe-size"` + CDXDedupeServer string `mapstructure:"warc-cdx-dedupe-server"` + CDXCookie string `mapstructure:"warc-cdx-cookie"` + HQAddress string `mapstructure:"hq-address"` + HQKey string `mapstructure:"hq-key"` + HQSecret string `mapstructure:"hq-secret"` + HQProject string `mapstructure:"hq-project"` + HQStrategy string `mapstructure:"hq-strategy"` + HQBatchSize int `mapstructure:"hq-batch-size"` + HQBatchConcurrency int `mapstructure:"hq-batch-concurrency"` + DisableHTMLTag []string `mapstructure:"disable-html-tag"` + ExcludeHosts 
[]string `mapstructure:"exclude-host"` + IncludeHosts []string `mapstructure:"include-host"` + IncludeString []string `mapstructure:"include-string"` + ExcludeString []string `mapstructure:"exclude-string"` + ExclusionFile []string `mapstructure:"exclusion-file"` + WorkersCount int `mapstructure:"workers"` + MaxConcurrentAssets int `mapstructure:"max-concurrent-assets"` + MaxHops int `mapstructure:"max-hops"` + MaxRedirect int `mapstructure:"max-redirect"` + MaxRetry int `mapstructure:"max-retry"` + HTTPTimeout int `mapstructure:"http-timeout"` + HTTPReadDeadline int `mapstructure:"http-read-deadline"` + CrawlTimeLimit int `mapstructure:"crawl-time-limit"` + CrawlMaxTimeLimit int `mapstructure:"crawl-max-time-limit"` + MinSpaceRequired int `mapstructure:"min-space-required"` + KeepCookies bool `mapstructure:"keep-cookies"` + Headless bool `mapstructure:"headless"` + JSON bool `mapstructure:"json"` + API bool `mapstructure:"api"` + Prometheus bool `mapstructure:"prometheus"` + DomainsCrawl []string `mapstructure:"domains-crawl"` + CaptureAlternatePages bool `mapstructure:"capture-alternate-pages"` + DisableLocalDedupe bool `mapstructure:"disable-local-dedupe"` + CertValidation bool `mapstructure:"cert-validation"` + DisableAssetsCapture bool `mapstructure:"disable-assets-capture"` + UseHQ bool // Special field to check if HQ is enabled depending on the command called + HQRateLimitingSendBack bool `mapstructure:"hq-rate-limiting-send-back"` + NoBatchWriteWAL bool `mapstructure:"ultrasafe-queue"` + Handover bool `mapstructure:"handover"` + + // Network + Proxy string `mapstructure:"proxy"` + DomainsBypassProxy []string `mapstructure:"bypass-proxy"` + RandomLocalIP bool `mapstructure:"random-local-ip"` + DisableIPv4 bool `mapstructure:"disable-ipv4"` + DisableIPv6 bool `mapstructure:"disable-ipv6"` + IPv6AnyIP bool `mapstructure:"ipv6-anyip"` + + // Dependencies + NoYTDLP bool `mapstructure:"no-ytdlp"` + YTDLPPath string `mapstructure:"ytdlp-path"` + + // Logging + 
NoStdoutLogging bool `mapstructure:"no-stdout-log"` + NoStderrLogging bool `mapstructure:"no-stderr-log"` + NoFileLogging bool `mapstructure:"no-log-file"` + StdoutLogLevel string `mapstructure:"log-level"` + TUI bool `mapstructure:"tui"` + TUILogLevel string `mapstructure:"tui-log-level"` + LogFileLevel string `mapstructure:"log-file-level"` + LogFileOutputDir string `mapstructure:"log-file-output-dir"` + LogFilePrefix string `mapstructure:"log-file-prefix"` + LogFileRotation string `mapstructure:"log-file-rotation"` + ElasticSearchURLs string `mapstructure:"log-es-urls"` + ElasticSearchUsername string `mapstructure:"log-es-user"` + ElasticSearchPassword string `mapstructure:"log-es-password"` + ElasticSearchLogLevel string `mapstructure:"log-es-log-level"` + ElasticSearchIndexPrefix string `mapstructure:"log-es-index-prefix"` + ElasticSearchRotation string `mapstructure:"log-es-rotation"` + + // Profiling + PyroscopeAddress string `mapstructure:"pyroscope-address"` + + InputSeeds []string // Special field to store the input URLs + ExclusionRegexes []*regexp.Regexp // Special field to store the compiled exclusion regex (from --exclusion-file) +} + +var ( + config *Config + once sync.Once +) + +// InitConfig initializes the configuration +// Flags -> Env -> Config file -> Consul config +// Latest has precedence over the rest +func InitConfig() error { + var err error + once.Do(func() { + config = &Config{} + + // Check if a config file is provided via flag + if configFile := viper.GetString("config-file"); configFile != "" { + viper.SetConfigFile(configFile) + } else { + home, err := os.UserHomeDir() + if err != nil { + fmt.Println(err) + os.Exit(1) + } + + viper.AddConfigPath(home) + viper.SetConfigType("yaml") + viper.SetConfigName("zeno-config") + } + + viper.SetEnvPrefix("ZENO") + replacer := strings.NewReplacer("-", "_", ".", "_") + viper.SetEnvKeyReplacer(replacer) + viper.AutomaticEnv() + + if err = viper.ReadInConfig(); err == nil { + fmt.Println("Using 
config file:", viper.ConfigFileUsed()) + } + + if viper.GetBool("consul-config") && viper.GetString("consul-address") != "" { + var consulAddress *url.URL + consulAddress, err = url.Parse(viper.GetString("consul-address")) + if err != nil { + return + } + + consulPath, consulFile := filepath.Split(viper.GetString("consul-path")) + viper.AddRemoteProvider("consul", consulAddress.String(), consulPath) + viper.SetConfigType(filepath.Ext(consulFile)) + viper.SetConfigName(strings.TrimSuffix(consulFile, filepath.Ext(consulFile))) + + if err = viper.ReadInConfig(); err == nil { + fmt.Println("Using config file:", viper.ConfigFileUsed()) + } + } + + // This function is used to bring logic to the flags when needed (e.g. live-stats) + handleFlagsEdgeCases() + + // This function is used to handle flags aliases (e.g. hops -> max-hops) + handleFlagsAliases() + + // Unmarshal the config into the Config struct + err = viper.Unmarshal(config) + }) + return err +} + +// BindFlags binds the flags to the viper configuration +// This is needed because viper doesn't support same flag name accross multiple commands +// Details here: https://github.com/spf13/viper/issues/375#issuecomment-794668149 +func BindFlags(flagSet *pflag.FlagSet) { + flagSet.VisitAll(func(flag *pflag.Flag) { + viper.BindPFlag(flag.Name, flag) + }) +} + +// Get returns the config struct +func Get() *Config { + return config +} + +func GenerateCrawlConfig() error { + // If the job name isn't specified, we generate a random name + if config.Job == "" { + if config.HQProject != "" { + config.Job = config.HQProject + } else { + UUID, err := uuid.NewUUID() + if err != nil { + slog.Error("cmd/utils.go:InitCrawlWithCMD():uuid.NewUUID()", "error", err) + return err + } + + config.Job = UUID.String() + } + } + + config.JobPath = path.Join("jobs", config.Job) + config.UseSeencheck = !config.DisableSeencheck + + // Defaults --max-crawl-time-limit to 10% more than --crawl-time-limit + if config.CrawlMaxTimeLimit == 0 && 
config.CrawlTimeLimit != 0 { + config.CrawlMaxTimeLimit = config.CrawlTimeLimit + (config.CrawlTimeLimit / 10) + } + + // We exclude some hosts by default + config.ExcludeHosts = utils.DedupeStrings(append(config.ExcludeHosts, "archive.org", "archive-it.org")) + + if config.WARCTempDir == "" { + config.WARCTempDir = path.Join(config.JobPath, "temp") + } + + if config.UserAgent == "" { + version := utils.GetVersion() + + // If Version is a commit hash, we only take the first 7 characters + if len(version.Version) >= 40 { + version.Version = version.Version[:7] + } + + config.UserAgent = "Mozilla/5.0 (compatible; archive.org_bot +http://archive.org/details/archive.org_bot) Zeno/" + version.Version + " warc/" + version.WarcVersion + slog.Info("User-Agent set to", "user-agent", config.UserAgent) + } + + if config.RandomLocalIP { + slog.Warn("Random local IP is enabled") + } + + if config.DisableIPv4 && config.DisableIPv6 { + slog.Error("Both IPv4 and IPv6 are disabled, at least one of them must be enabled.") + os.Exit(1) + } else if config.DisableIPv4 { + slog.Info("IPv4 is disabled") + } else if config.DisableIPv6 { + slog.Info("IPv6 is disabled") + } + + if len(config.ExclusionFile) > 0 { + for _, file := range config.ExclusionFile { + var ( + regexes []string + err error + ) + + if strings.HasPrefix(file, "http://") || strings.HasPrefix(file, "https://") { + slog.Info("Reading (remote) exclusion file", "file", file) + regexes, err = readRemoteExclusionFile(file) + if err != nil { + return err + } + } else { + slog.Info("Reading (local) exclusion file", "file", file) + regexes, err = readLocalExclusionFile(file) + if err != nil { + return err + } + } + + slog.Info("Compiling exclusion regexes", "regexes", len(regexes)) + compiledRegexes := compileRegexes(regexes) + + config.ExclusionRegexes = append(config.ExclusionRegexes, compiledRegexes...) 
+ } + } + + if len(config.DomainsCrawl) > 0 { + slog.Info("Domains crawl enabled", "domains/regex", config.DomainsCrawl) + err := domainscrawl.AddElements(config.DomainsCrawl) + if err != nil { + panic(err) + } + } + + return nil +} + +func compileRegexes(regexes []string) []*regexp.Regexp { + var compiledRegexes []*regexp.Regexp + + for _, regex := range regexes { + slog.Debug("Compiling regex", "regex", regex) + compiledRegex := regexp.MustCompile(regex) + + compiledRegexes = append(compiledRegexes, compiledRegex) + } + + return compiledRegexes +} + +func readLocalExclusionFile(file string) (regexes []string, err error) { + f, err := os.Open(file) + if err != nil { + return regexes, err + } + defer f.Close() + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + regexes = append(regexes, scanner.Text()) + } + + if err := scanner.Err(); err != nil { + return regexes, err + } + + return regexes, nil +} + +func readRemoteExclusionFile(URL string) (regexes []string, err error) { + httpClient := &http.Client{ + Timeout: time.Second * 5, + } + + req, err := http.NewRequest(http.MethodGet, URL, nil) + if err != nil { + return regexes, err + } + + req.Header.Set("User-Agent", config.UserAgent) + + resp, err := httpClient.Do(req) + if err != nil { + return regexes, err + } + + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return regexes, fmt.Errorf("failed to download exclusion file: %s", resp.Status) + } + + // Read file line by line + scanner := bufio.NewScanner(resp.Body) + for scanner.Scan() { + regexes = append(regexes, scanner.Text()) + } + + if err := scanner.Err(); err != nil { + return regexes, err + } + + return regexes, nil +} + +func handleFlagsEdgeCases() { + if viper.GetBool("tui") { + // If live-stats is true, set no-stdout-log to true + viper.Set("no-stdout-log", true) + viper.Set("no-stderr-log", true) + } + + if viper.GetBool("prometheus") { + // If prometheus is true, set no-stdout-log to true + viper.Set("api", true) + } +} + 
+func handleFlagsAliases() { + // For each flag we want to alias, we check if the original flag is at default and if the alias is not + // If so, we set the original flag to the value of the alias + if viper.GetUint("hops") != 0 && viper.GetUint("max-hops") == 0 { + viper.Set("max-hops", viper.GetUint("hops")) + } + + if viper.GetInt("ca") != 1 && viper.GetInt("max-concurrent-assets") == 1 { + viper.Set("max-concurrent-assets", viper.GetInt("ca")) + } + + if viper.GetInt("msr") != 20 && viper.GetInt("min-space-required") == 20 { + viper.Set("min-space-required", viper.GetInt("msr")) + } +} diff --git a/internal/pkg/controler/channels.go b/internal/pkg/controler/channels.go new file mode 100644 index 00000000..7d5a6984 --- /dev/null +++ b/internal/pkg/controler/channels.go @@ -0,0 +1,29 @@ +package controler + +import "github.com/internetarchive/Zeno/pkg/models" + +var ( + stageChannels []chan *models.Item +) + +func makeStageChannel(bufferSize ...int) chan *models.Item { + var parsedSize int + + if len(bufferSize) == 0 { + parsedSize = 0 + } else if len(bufferSize) == 1 { + parsedSize = bufferSize[0] + } else { + panic("makeStageChannel: too many arguments, variadic argument should be omitted or a single integer") + } + + ch := make(chan *models.Item, parsedSize) + stageChannels = append(stageChannels, ch) + return ch +} + +func closeStageChannels() { + for _, ch := range stageChannels { + close(ch) + } +} diff --git a/internal/pkg/controler/controler.go b/internal/pkg/controler/controler.go new file mode 100644 index 00000000..4c39f252 --- /dev/null +++ b/internal/pkg/controler/controler.go @@ -0,0 +1,13 @@ +// Package controler provides a way to start and stop the pipeline. +package controler + +// Start initializes the pipeline. +func Start() { + startPipeline() +} + +// Stop stops the pipeline. 
+func Stop() { + stopPipeline() + closeStageChannels() +} diff --git a/internal/pkg/controler/pause/pause.go b/internal/pkg/controler/pause/pause.go new file mode 100644 index 00000000..55ac1cbc --- /dev/null +++ b/internal/pkg/controler/pause/pause.go @@ -0,0 +1,106 @@ +package pause + +import ( + "sync" + "sync/atomic" + + "github.com/internetarchive/Zeno/internal/pkg/stats" +) + +type ControlChans struct { + PauseCh chan struct{} + ResumeCh chan struct{} +} + +type pauseManager struct { + subscribers sync.Map // Map of *ControlChans to struct{} + isPaused atomic.Bool + message string +} + +var manager = &pauseManager{} + +// Subscribe returns a ControlChans struct for the subscriber to use. +func Subscribe() *ControlChans { + chans := &ControlChans{ + PauseCh: make(chan struct{}, 1), // Buffered to ensure non-blocking sends + ResumeCh: make(chan struct{}), // Unbuffered, will block on send + } + manager.subscribers.Store(chans, struct{}{}) + return chans +} + +// Unsubscribe removes the subscriber and closes its channels. +func Unsubscribe(chans *ControlChans) { + manager.subscribers.Delete(chans) + // Close channels safely (deferred to avoid panic if already closed). + defer func() { + recover() + }() + close(chans.PauseCh) + close(chans.ResumeCh) +} + +// Pause sends a pause signal to all subscribers. +func Pause(message ...string) { + swap := manager.isPaused.CompareAndSwap(false, true) + if !swap { + return + } + + if len(message) == 0 { + message = append(message, "Paused") + } + + manager.message = message[0] + + manager.subscribers.Range(func(key, _ interface{}) bool { + chans := key.(*ControlChans) + // Send pause signal (non-blocking since PauseCh is buffered). + select { + case chans.PauseCh <- struct{}{}: + // Signal sent. + default: + // PauseCh already has a signal. + } + return true + }) + stats.PausedSet() +} + +// Resume reads from each subscriber's ResumeCh to unblock them. 
+func Resume() { + var wg sync.WaitGroup + manager.subscribers.Range(func(key, _ interface{}) bool { + chans := key.(*ControlChans) + wg.Add(1) + go func(chans *ControlChans) { + defer wg.Done() + // Read from ResumeCh to unblock subscriber. + _, ok := <-chans.ResumeCh + if !ok { + // Channel closed; subscriber may have unsubscribed. + return + } + }(chans) + return true + }) + // Wait for all subscribers to send on their ResumeCh. + wg.Wait() + + swap := manager.isPaused.CompareAndSwap(true, false) + if !swap { + return + } + manager.message = "" + + stats.PausedReset() +} + +func IsPaused() bool { + return manager.isPaused.Load() +} + +func GetMessage() string { + return manager.message +} diff --git a/internal/pkg/controler/pause/pause_test.go b/internal/pkg/controler/pause/pause_test.go new file mode 100644 index 00000000..b22d429d --- /dev/null +++ b/internal/pkg/controler/pause/pause_test.go @@ -0,0 +1,429 @@ +package pause + +import ( + "context" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/internetarchive/Zeno/internal/pkg/stats" +) + +func TestBasicPauseResume(t *testing.T) { + stats.Init() + manager = &pauseManager{} + + var wg sync.WaitGroup + wg.Add(1) + + subscribed := make(chan struct{}) + pausedCh := make(chan struct{}) + resumedCh := make(chan struct{}) + + go func() { + defer wg.Done() + controlChans := Subscribe() + defer Unsubscribe(controlChans) + + subscribed <- struct{}{} + + for { + select { + case <-controlChans.PauseCh: + // Signal that we have received the pause signal + pausedCh <- struct{}{} + // Attempt to send to ResumeCh; blocks until Resume() reads from it. + controlChans.ResumeCh <- struct{}{} + // Signal that we have resumed + resumedCh <- struct{}{} + return // Exit after resuming. + default: + time.Sleep(10 * time.Millisecond) // Simulate work. + } + } + }() + + // Wait for the goroutine to subscribe + <-subscribed + + // Pause the system. 
+ Pause() + + // Wait for the goroutine to signal that it has paused + select { + case <-pausedCh: + // Paused successfully + case <-time.After(200 * time.Millisecond): + t.Fatal("Subscriber did not receive pause signal") + } + + // Resume the system. + Resume() + + // Wait for the goroutine to signal that it has resumed + select { + case <-resumedCh: + // Resumed successfully + case <-time.After(200 * time.Millisecond): + t.Fatal("Subscriber did not resume") + } + + wg.Wait() +} + +func TestMultipleSubscribers(t *testing.T) { + stats.Init() + manager = &pauseManager{} + const numSubscribers = 10 + var wg sync.WaitGroup + + subscribedChans := make([]chan struct{}, numSubscribers) + pausedChans := make([]chan struct{}, numSubscribers) + resumedChans := make([]chan struct{}, numSubscribers) + + // Create multiple subscribers. + for i := 0; i < numSubscribers; i++ { + wg.Add(1) + subscribedChans[i] = make(chan struct{}) + pausedChans[i] = make(chan struct{}) + resumedChans[i] = make(chan struct{}) + + go func(idx int) { + defer wg.Done() + controlChans := Subscribe() + defer Unsubscribe(controlChans) + + subscribedChans[idx] <- struct{}{} + + for { + select { + case <-controlChans.PauseCh: + // Signal that we have paused + pausedChans[idx] <- struct{}{} + // Attempt to send to ResumeCh; blocks until Resume() reads from it. + controlChans.ResumeCh <- struct{}{} + // Signal that we have resumed + resumedChans[idx] <- struct{}{} + return // Exit after resuming. + default: + time.Sleep(10 * time.Millisecond) // Simulate work. + } + } + }(i) + } + + // Wait for all subscribers to subscribe + for i := 0; i < numSubscribers; i++ { + <-subscribedChans[i] + } + + // Pause the system. 
+ Pause() + + // Wait for all subscribers to acknowledge the pause + for i := 0; i < numSubscribers; i++ { + select { + case <-pausedChans[i]: + // Subscriber paused + case <-time.After(100 * time.Millisecond): + t.Fatalf("Subscriber %d did not receive pause signal", i) + } + } + + // Resume the system. + Resume() + + // Wait for all subscribers to acknowledge the resume + for i := 0; i < numSubscribers; i++ { + select { + case <-resumedChans[i]: + // Subscriber resumed + case <-time.After(100 * time.Millisecond): + t.Fatalf("Subscriber %d did not resume", i) + } + } + + wg.Wait() +} + +func TestSubscriberUnsubscribeDuringPause(t *testing.T) { + stats.Init() + manager = &pauseManager{} + var wg sync.WaitGroup + wg.Add(1) + + subscribedCh := make(chan struct{}) + pausedCh := make(chan struct{}) + + go func() { + defer wg.Done() + controlChans := Subscribe() + defer Unsubscribe(controlChans) + + subscribedCh <- struct{}{} + + for { + select { + case <-controlChans.PauseCh: + // Signal that we have paused + pausedCh <- struct{}{} + // Unsubscribe during pause. + Unsubscribe(controlChans) + return + default: + time.Sleep(10 * time.Millisecond) // Simulate work. + } + } + }() + + // Wait for the subscriber to subscribe + <-subscribedCh + + // Pause the system. + Pause() + + // Wait for the subscriber to acknowledge the pause + select { + case <-pausedCh: + // Subscriber paused and unsubscribed + case <-time.After(100 * time.Millisecond): + t.Fatal("Subscriber did not receive pause signal") + } + + // Resume the system. + Resume() + time.Sleep(100 * time.Millisecond) // Allow any processing. 
+ + wg.Wait() +} + +func TestConcurrentPauseResume(t *testing.T) { + stats.Init() + manager = &pauseManager{} + const numSubscribers = 5 + const numCycles = 10 + + var wg sync.WaitGroup + wg.Add(numSubscribers) + + // Channels to signal pause and resume completions + subscribedCh := make(chan struct{}) + pauseComplete := make(chan struct{}) + resumeComplete := make(chan struct{}) + + // Channel to receive counts from goroutines + countsCh := make(chan struct { + pauses int32 + resumes int32 + }, numSubscribers) + + // Create subscribers + for i := 0; i < numSubscribers; i++ { + go func() { + defer wg.Done() + controlChans := Subscribe() + defer Unsubscribe(controlChans) + + subscribedCh <- struct{}{} + + var pauses, resumes int32 + + for j := 0; j < numCycles; j++ { + // Wait for pause signal + <-controlChans.PauseCh + pauses++ + + // Signal that we've received the pause + pauseComplete <- struct{}{} + + // Block until resumed + controlChans.ResumeCh <- struct{}{} + resumes++ + + // Signal that we've resumed + resumeComplete <- struct{}{} + } + + // Send counts back to main goroutine + countsCh <- struct { + pauses int32 + resumes int32 + }{pauses, resumes} + }() + } + + // Wait for all subscribers to subscribe + for i := 0; i < numSubscribers; i++ { + <-subscribedCh + } + + // Perform pause and resume cycles + for i := 0; i < numCycles; i++ { + // Perform pause + Pause() + + // Wait for all subscribers to acknowledge the pause + for j := 0; j < numSubscribers; j++ { + <-pauseComplete + } + + // Perform resume + Resume() + + // Wait for all subscribers to acknowledge the resume + for j := 0; j < numSubscribers; j++ { + <-resumeComplete + } + } + + // Wait for all subscribers to finish + wg.Wait() + close(countsCh) + + // Verify that all subscribers have processed the correct number of pauses and resumes + for counts := range countsCh { + if counts.pauses != numCycles { + t.Fatalf("Subscriber expected to process %d pauses, but processed %d", numCycles, 
counts.pauses) + } + if counts.resumes != numCycles { + t.Fatalf("Subscriber expected to process %d resumes, but processed %d", numCycles, counts.resumes) + } + } +} + +func TestPauseResumeWithUnsubscribe(t *testing.T) { + stats.Init() + manager = &pauseManager{} + var wg sync.WaitGroup + wg.Add(1) + + subscribedCh := make(chan struct{}) + pausedCh := make(chan struct{}) + resumedCh := make(chan struct{}) + + go func() { + defer wg.Done() + controlChans := Subscribe() + subscribedCh <- struct{}{} + // Unsubscribe after resuming. + + for { + select { + case <-controlChans.PauseCh: + // Signal that we have paused + pausedCh <- struct{}{} + // Attempt to send to ResumeCh; blocks until Resume() reads from it. + controlChans.ResumeCh <- struct{}{} + // Signal that we have resumed + resumedCh <- struct{}{} + // Unsubscribe after resuming. + Unsubscribe(controlChans) + return + default: + time.Sleep(10 * time.Millisecond) // Simulate work. + } + } + }() + + // Wait for the subscriber to subscribe + <-subscribedCh + + // Pause the system. + Pause() + + // Wait for the subscriber to acknowledge pause + select { + case <-pausedCh: + // Subscriber paused + case <-time.After(100 * time.Millisecond): + t.Fatal("Subscriber did not receive pause signal") + } + + // Resume the system. + Resume() + + // Wait for the subscriber to acknowledge resume + select { + case <-resumedCh: + // Subscriber resumed + case <-time.After(100 * time.Millisecond): + t.Fatal("Subscriber did not resume") + } + + wg.Wait() +} + +func TestNoSubscribers(t *testing.T) { + stats.Init() + manager = &pauseManager{} + // Call Pause() and Resume() when there are no subscribers. + // If no panic occurs, the test passes. + Pause() + Resume() +} + +func TestPauseResumeE2E(t *testing.T) { + stats.Init() + manager = &pauseManager{} + var workCounter int32 // Counts the amount of work done. 
+ var wg sync.WaitGroup + wg.Add(1) + + ctx, cancel := context.WithCancel(context.Background()) + + // Start the worker goroutine. + go func() { + controlChans := Subscribe() + defer Unsubscribe(controlChans) + defer wg.Done() + for { + select { + case <-ctx.Done(): + return + case <-controlChans.PauseCh: + // Attempt to send to ResumeCh; blocks until Resume() reads from it. + controlChans.ResumeCh <- struct{}{} + default: + // Simulate work. + atomic.AddInt32(&workCounter, 1) + time.Sleep(100 * time.Millisecond) + } + } + }() + + // Allow the worker to do some work. + time.Sleep(1 * time.Second) + workBeforePause := atomic.LoadInt32(&workCounter) + + // Pause the system. + Pause() + pauseStart := time.Now() + + // Sleep for 1 second to keep the system paused. + time.Sleep(1 * time.Second) + + // Resume the system. + Resume() + pauseDuration := time.Since(pauseStart) + + // Allow the worker to do more work. + time.Sleep(1 * time.Second) + workAfterResume := atomic.LoadInt32(&workCounter) + + // Calculate the amount of work done during the pause. + workDuringPause := workAfterResume - workBeforePause - 10 // Expected 10 units of work after resume. + + // Check that no work was done during the pause. + if workDuringPause != 0 { + t.Fatalf("Expected no work during pause, but got %d units of work", workDuringPause) + } + + // Verify that the pause duration is approximately 1 second. 
+ if pauseDuration < 900*time.Millisecond || pauseDuration > 1100*time.Millisecond { + t.Fatalf("Expected pause duration around 1 second, but got %v", pauseDuration) + } + + cancel() + wg.Wait() +} diff --git a/internal/pkg/controler/pipeline.go b/internal/pkg/controler/pipeline.go new file mode 100644 index 00000000..02937798 --- /dev/null +++ b/internal/pkg/controler/pipeline.go @@ -0,0 +1,184 @@ +package controler + +import ( + "fmt" + "os" + "time" + + "github.com/google/uuid" + "github.com/internetarchive/Zeno/internal/pkg/archiver" + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/controler/watchers" + "github.com/internetarchive/Zeno/internal/pkg/finisher" + "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/Zeno/internal/pkg/postprocessor" + "github.com/internetarchive/Zeno/internal/pkg/preprocessor" + "github.com/internetarchive/Zeno/internal/pkg/preprocessor/seencheck" + "github.com/internetarchive/Zeno/internal/pkg/reactor" + "github.com/internetarchive/Zeno/internal/pkg/source/hq" + "github.com/internetarchive/Zeno/internal/pkg/stats" + "github.com/internetarchive/Zeno/pkg/models" +) + +func startPipeline() { + if err := os.MkdirAll(config.Get().JobPath, 0755); err != nil { + fmt.Printf("can't create job directory: %s\n", err) + os.Exit(1) + } + + if err := watchers.CheckDiskUsage(config.Get().JobPath); err != nil { + fmt.Printf("can't start Zeno: %s\n", err) + os.Exit(1) + } + + err := log.Start() + if err != nil { + fmt.Println("error starting logger", "err", err.Error()) + panic(err) + } + + logger := log.NewFieldedLogger(&log.Fields{ + "component": "controler.StartPipeline", + }) + + err = stats.Init() + if err != nil { + logger.Error("error initializing stats", "err", err.Error()) + panic(err) + } + + // Start the disk watcher + go watchers.WatchDiskSpace(config.Get().JobPath, 5*time.Second) + + // Start the reactor that will receive + reactorOutputChan := 
makeStageChannel(config.Get().WorkersCount) + err = reactor.Start(config.Get().WorkersCount, reactorOutputChan) + if err != nil { + logger.Error("error starting reactor", "err", err.Error()) + panic(err) + } + + // If needed, create the seencheck DB (only if not using HQ) + if config.Get().UseSeencheck && !config.Get().UseHQ { + err := seencheck.Start(config.Get().JobPath) + if err != nil { + logger.Error("unable to start seencheck", "err", err.Error()) + panic(err) + } + } + + preprocessorOutputChan := makeStageChannel(config.Get().WorkersCount) + err = preprocessor.Start(reactorOutputChan, preprocessorOutputChan) + if err != nil { + logger.Error("error starting preprocessor", "err", err.Error()) + panic(err) + } + + archiverOutputChan := makeStageChannel(config.Get().WorkersCount) + err = archiver.Start(preprocessorOutputChan, archiverOutputChan) + if err != nil { + logger.Error("error starting archiver", "err", err.Error()) + panic(err) + } + + // Start the WARC writing queue watcher + go watchers.WatchWARCWritingQueue(5 * time.Second) + + postprocessorOutputChan := makeStageChannel(config.Get().WorkersCount) + err = postprocessor.Start(archiverOutputChan, postprocessorOutputChan) + if err != nil { + logger.Error("error starting postprocessor", "err", err.Error()) + panic(err) + } + + finisherFinishChan := makeStageChannel(config.Get().WorkersCount) + finisherProduceChan := makeStageChannel(config.Get().WorkersCount) + + if config.Get().UseHQ { + logger.Info("starting hq") + err = hq.Start(finisherFinishChan, finisherProduceChan) + if err != nil { + logger.Error("error starting hq source, retrying", "err", err.Error()) + panic(err) + } + } else { + // Means we're using the to-be-implemented local queue, for the moment we're just gonna consume the channels + go func() { + for { + select { + case _, ok := <-finisherFinishChan: + if !ok { + return + } + case _, ok := <-finisherProduceChan: + if !ok { + return + } + } + } + }() + } + + err = 
finisher.Start(postprocessorOutputChan, finisherFinishChan, finisherProduceChan) + if err != nil { + logger.Error("error starting finisher", "err", err.Error()) + panic(err) + } + + // Pipe in the reactor the input seeds if any + if len(config.Get().InputSeeds) > 0 { + for _, seed := range config.Get().InputSeeds { + parsedURL := &models.URL{Raw: seed} + err := parsedURL.Parse() + if err != nil { + panic(err) + } + + item := models.NewItem(uuid.New().String(), parsedURL, "") + item.SetSource(models.ItemSourceQueue) + + err = reactor.ReceiveInsert(item) + if err != nil { + logger.Error("unable to insert seed", "err", err.Error()) + panic(err) + } + } + } +} + +func stopPipeline() { + logger := log.NewFieldedLogger(&log.Fields{ + "component": "controler.stopPipeline", + }) + + watchers.StopDiskWatcher() + watchers.StopWARCWritingQueueWatcher() + + reactor.Freeze() + + preprocessor.Stop() + archiver.Stop() + postprocessor.Stop() + finisher.Stop() + + if config.Get().UseSeencheck && !config.Get().UseHQ { + seencheck.Close() + } + + if config.Get().UseHQ { + hq.Stop() + } + + reactor.Stop() + + if config.Get().WARCTempDir != "" { + err := os.Remove(config.Get().WARCTempDir) + if err != nil { + logger.Error("unable to remove temp dir", "err", err.Error()) + } + } + + logger.Info("done, logs are flushing and will be closed") + + log.Stop() +} diff --git a/internal/pkg/controler/signal.go b/internal/pkg/controler/signal.go new file mode 100644 index 00000000..ee59feb3 --- /dev/null +++ b/internal/pkg/controler/signal.go @@ -0,0 +1,38 @@ +package controler + +import ( + "context" + "os" + "os/signal" + "syscall" + + "github.com/internetarchive/Zeno/internal/pkg/log" +) + +var signalWatcherCtx, signalWatcherCancel = context.WithCancel(context.Background()) + +// WatchSignals listens for OS signals and handles them gracefully +func WatchSignals() { + logger := log.NewFieldedLogger(&log.Fields{ + "component": "controler.signalWatcher", + }) + // Handle OS signals for graceful 
shutdown + signalChan := make(chan os.Signal, 1) + signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM) + + select { + case <-signalWatcherCtx.Done(): + return + case <-signalChan: + logger.Info("received shutdown signal, stopping services...") + // Catch a second signal to force exit + go func() { + <-signalChan + logger.Info("received second shutdown signal, forcing exit...") + os.Exit(1) + }() + + Stop() + os.Exit(0) + } +} diff --git a/internal/pkg/controler/watchers/disk.go b/internal/pkg/controler/watchers/disk.go new file mode 100644 index 00000000..a1d16c20 --- /dev/null +++ b/internal/pkg/controler/watchers/disk.go @@ -0,0 +1,98 @@ +package watchers + +import ( + "context" + "fmt" + "sync" + "syscall" + "time" + + "github.com/internetarchive/Zeno/internal/pkg/controler/pause" + "github.com/internetarchive/Zeno/internal/pkg/log" +) + +var ( + diskWatcherCtx, diskWatcherCancel = context.WithCancel(context.Background()) + diskWatcherWg sync.WaitGroup +) + +// Implements f(x)={ if total <= 256GB then threshold = 50GB * (total / 256GB) else threshold = 50GB } +func checkThreshold(total, free uint64) error { + const ( + GB = 1024 * 1024 * 1024 + ) + var threshold float64 + + if total <= 256*GB { + threshold = float64(50*GB) * (float64(total) / float64(256*GB)) + } else { + threshold = 50 * GB + } + + // Compare free space with threshold + if free < uint64(threshold) { + return fmt.Errorf("low disk space: free=%.2f GB, threshold=%.2f GB", float64(free)/1e9, float64(threshold)/1e9) + } + + return nil +} + +func CheckDiskUsage(path string) error { + var stat syscall.Statfs_t + if err := syscall.Statfs(path, &stat); err != nil { + panic(fmt.Sprintf("Error retrieving disk stats: %v\n", err)) + } + + total := stat.Blocks * uint64(stat.Bsize) + free := stat.Bavail * uint64(stat.Bsize) + + return checkThreshold(total, free) +} + +// WatchDiskSpace watches the disk space and pauses the pipeline if it's low +func WatchDiskSpace(path string, interval time.Duration) { 
+ diskWatcherWg.Add(1) + defer diskWatcherWg.Done() + + logger := log.NewFieldedLogger(&log.Fields{ + "component": "controler.diskWatcher", + }) + + paused := false + returnASAP := false + ticker := time.NewTicker(interval) + defer ticker.Stop() + + for { + select { + case <-diskWatcherCtx.Done(): + defer logger.Debug("closed") + if paused { + logger.Info("returning after resume") + returnASAP = true + } + return + case <-ticker.C: + err := CheckDiskUsage(path) + + if err != nil && !paused { + logger.Warn("Low disk space, pausing the pipeline", "err", err.Error()) + pause.Pause("Not enough disk space!!!") + paused = true + } else if err == nil && paused { + logger.Info("Disk space is sufficient, resuming the pipeline") + pause.Resume() + paused = false + if returnASAP { + return + } + } + } + } +} + +// StopDiskWatcher stops the disk watcher by canceling the context and waiting for the goroutine to finish. +func StopDiskWatcher() { + diskWatcherCancel() + diskWatcherWg.Wait() +} diff --git a/internal/pkg/controler/watchers/disk_test.go b/internal/pkg/controler/watchers/disk_test.go new file mode 100644 index 00000000..3f8bdcdc --- /dev/null +++ b/internal/pkg/controler/watchers/disk_test.go @@ -0,0 +1,54 @@ +package watchers + +import ( + "testing" +) + +func TestCheckThreshold(t *testing.T) { + tests := []struct { + name string + total uint64 + free uint64 + wantError bool + }{ + { + name: "Low disk space on large disk", + total: 300 * 1024 * 1024 * 1024, // 300 GiB + free: 15 * 1024 * 1024 * 1024, // 15 GiB + wantError: true, + }, + { + name: "Sufficient disk space on large disk", + total: 300 * 1024 * 1024 * 1024, // 300 GiB + free: 50 * 1024 * 1024 * 1024, // 50 GiB + wantError: false, + }, + { + name: "Low disk space on small disk", + total: 100 * 1024 * 1024 * 1024, // 100 GiB + free: 3 * 1024 * 1024 * 1024, // 3 GiB + wantError: true, + }, + { + name: "Sufficient disk space on small disk", + total: 100 * 1024 * 1024 * 1024, // 100 GiB + free: 60 * 1024 * 
1024 * 1024, // 60 GiB
+			wantError: false,
+		},
+		{
+			name:      "Edge case: exactly at threshold for small disk",
+			total:     300 * 1024 * 1024 * 1024, // 300 GiB
+			free:      uint64((50 * 1024 * 1024 * 1024) * (float64(300*1024*1024*1024) / float64(256*1024*1024*1024))), // ~58.6 GiB, above the flat 50 GiB threshold for disks > 256 GiB
+			wantError: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := checkThreshold(tt.total, tt.free)
+			if (err != nil) != tt.wantError {
+				t.Errorf("checkThreshold() error = %v, wantError %v", err, tt.wantError)
+			}
+		})
+	}
+} diff --git a/internal/pkg/controler/watchers/warc.go b/internal/pkg/controler/watchers/warc.go new file mode 100644 index 00000000..b41396b9 --- /dev/null +++ b/internal/pkg/controler/watchers/warc.go @@ -0,0 +1,67 @@ +package watchers + +import ( + "context" + "sync" + "time" + + "github.com/internetarchive/Zeno/internal/pkg/archiver" + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/controler/pause" + "github.com/internetarchive/Zeno/internal/pkg/log" +) + +var ( + wwqCtx, wwqCancel = context.WithCancel(context.Background()) + wwqWg sync.WaitGroup +) + +// WatchWARCWritingQueue watches the WARC writing queue size and pauses the pipeline if it exceeds the worker count +func WatchWARCWritingQueue(interval time.Duration) { + wwqWg.Add(1) + defer wwqWg.Done() + + logger := log.NewFieldedLogger(&log.Fields{ + "component": "controler.warcWritingQueueWatcher", + }) + + paused := false + returnASAP := false + ticker := time.NewTicker(interval) + defer ticker.Stop() + + for { + select { + case <-wwqCtx.Done(): + defer logger.Debug("closed") + if paused { + logger.Info("returning after resume") + returnASAP = true + } + return + case <-ticker.C: + queueSize := archiver.GetWARCWritingQueueSize() + + logger.Debug("checking queue size", "queue_size", queueSize, "max_queue_size", config.Get().WorkersCount, "paused", paused) + + if queueSize > config.Get().WorkersCount && !paused { 
logger.Warn("WARC writing queue exceeded the worker count, pausing the pipeline") + pause.Pause("WARC writing queue exceeded the worker count") + paused = true + } else if queueSize < config.Get().WorkersCount && paused { + logger.Info("WARC writing queue size returned to acceptable, resuming the pipeline") + pause.Resume() + paused = false + if returnASAP { + return + } + } + } + } +} + +// StopWARCWritingQueueWatcher stops the WARC writing queue watcher by canceling the context and waiting for the goroutine to finish +func StopWARCWritingQueueWatcher() { + wwqCancel() + wwqWg.Wait() +} diff --git a/internal/pkg/crawl/dependencies/ytdlp/model.go b/internal/pkg/crawl/dependencies/ytdlp/model.go deleted file mode 100644 index c0e2c503..00000000 --- a/internal/pkg/crawl/dependencies/ytdlp/model.go +++ /dev/null @@ -1,114 +0,0 @@ -package ytdlp - -type Subtitle struct { - Ext string `json:"ext"` - URL string `json:"url"` - Name string `json:"name"` -} - -type Video struct { - ID string `json:"id"` - Title string `json:"title"` - Channel string `json:"channel"` - ChannelID string `json:"channel_id"` - ChannelURL string `json:"channel_url"` - Description string `json:"description"` - Timestamp int `json:"timestamp"` - Duration float64 `json:"duration"` - ViewCount float64 `json:"view_count"` - Tags []string `json:"tags"` - Categories []string `json:"categories"` - Thumbnail string `json:"thumbnail"` - Language string `json:"language"` - IsLive bool `json:"is_live"` - Subtitles map[string][]Subtitle `json:"subtitles"` - RequestedFormats []struct { - Acodec string `json:"acodec"` - AspectRatio float64 `json:"aspect_ratio"` - Asr interface{} `json:"asr"` - AudioChannels interface{} `json:"audio_channels"` - AudioExt string `json:"audio_ext"` - Container string `json:"container"` - DynamicRange string `json:"dynamic_range"` - Ext string `json:"ext"` - Filesize float64 `json:"filesize"` - Format string `json:"format"` - FormatID string `json:"format_id"` - FormatNote string 
`json:"format_note"` - Fps float64 `json:"fps"` - Fragments []struct { - URL string `json:"url"` - } `json:"fragments"` - HasDrm bool `json:"has_drm"` - Height float64 `json:"height"` - HTTPHeaders map[string]string `json:"http_headers"` - Language interface{} `json:"language"` - LanguagePreference float64 `json:"language_preference"` - Preference interface{} `json:"preference"` - Protocol string `json:"protocol"` - Quality float64 `json:"quality"` - Resolution string `json:"resolution"` - SourcePreference float64 `json:"source_preference"` - Tbr float64 `json:"tbr"` - URL string `json:"url"` - Vbr float64 `json:"vbr,omitempty"` - Vcodec string `json:"vcodec"` - VideoExt string `json:"video_ext"` - Width float64 `json:"width"` - Abr float64 `json:"abr,omitempty"` - } `json:"requested_formats"` - Formats []struct { - Acodec string `json:"acodec"` - AspectRatio float64 `json:"aspect_ratio"` - AudioExt string `json:"audio_ext"` - Columns float64 `json:"columns,omitempty"` - Ext string `json:"ext"` - Format string `json:"format"` - FormatID string `json:"format_id"` - FormatNote string `json:"format_note"` - Fps float64 `json:"fps"` - Fragments []struct { - Duration float64 `json:"duration"` - URL string `json:"url"` - } `json:"fragments,omitempty"` - Height float64 `json:"height"` - HTTPHeaders struct { - Accept string `json:"Accept"` - AcceptLanguage string `json:"Accept-Language"` - SecFetchMode string `json:"Sec-Fetch-Mode"` - UserAgent string `json:"User-Agent"` - } `json:"http_headers"` - Protocol string `json:"protocol"` - Resolution string `json:"resolution"` - Rows float64 `json:"rows,omitempty"` - URL string `json:"url"` - Vcodec string `json:"vcodec"` - VideoExt string `json:"video_ext"` - Width float64 `json:"width"` - Abr float64 `json:"abr,omitempty"` - Asr float64 `json:"asr,omitempty"` - AudioChannels float64 `json:"audio_channels,omitempty"` - Container string `json:"container,omitempty"` - DynamicRange interface{} `json:"dynamic_range,omitempty"` - 
Filesize float64 `json:"filesize,omitempty"` - HasDrm bool `json:"has_drm,omitempty"` - Language string `json:"language,omitempty"` - LanguagePreference float64 `json:"language_preference,omitempty"` - Preference interface{} `json:"preference,omitempty"` - Quality float64 `json:"quality,omitempty"` - SourcePreference float64 `json:"source_preference,omitempty"` - Tbr float64 `json:"tbr,omitempty"` - Vbr float64 `json:"vbr,omitempty"` - FilesizeApprox float64 `json:"filesize_approx,omitempty"` - } `json:"formats"` - Thumbnails []struct { - URL string `json:"url"` - } `json:"thumbnails"` -} - -type HTTPHeaders struct { - Accept string `json:"Accept"` - AcceptLanguage string `json:"Accept-Language"` - SecFetchMode string `json:"Sec-Fetch-Mode"` - UserAgent string `json:"User-Agent"` -} diff --git a/internal/pkg/crawl/dependencies/ytdlp/parse.go b/internal/pkg/crawl/dependencies/ytdlp/parse.go deleted file mode 100644 index 1c905a66..00000000 --- a/internal/pkg/crawl/dependencies/ytdlp/parse.go +++ /dev/null @@ -1,42 +0,0 @@ -package ytdlp - -import ( - "io" - "net/url" -) - -func Parse(body io.ReadCloser) (streamURLs, metaURLs []*url.URL, rawJSON string, HTTPHeaders map[string]string, err error) { - // Create a temporary server to serve the body and call ytdlp on it - port, stopChan, err := serveBody(body) - if err != nil { - return streamURLs, metaURLs, rawJSON, HTTPHeaders, err - } - defer close(stopChan) - - // Call ytdlp on the temporary server - rawStreamURLs, rawMetaURLs, rawJSON, HTTPHeaders, err := getJSON(port) - if err != nil { - return streamURLs, metaURLs, rawJSON, HTTPHeaders, err - } - - // Range over rawStreamURLs and rawMetaURLs to parse them as url.URL in videoURLs and metaURLs - for _, urlString := range rawStreamURLs { - URL, err := url.Parse(urlString) - if err != nil { - return streamURLs, metaURLs, rawJSON, HTTPHeaders, err - } - - streamURLs = append(streamURLs, URL) - } - - for _, urlString := range rawMetaURLs { - URL, err := 
url.Parse(urlString) - if err != nil { - return streamURLs, metaURLs, rawJSON, HTTPHeaders, err - } - - metaURLs = append(metaURLs, URL) - } - - return streamURLs, metaURLs, rawJSON, HTTPHeaders, nil -} diff --git a/internal/pkg/crawl/dependencies/ytdlp/server.go b/internal/pkg/crawl/dependencies/ytdlp/server.go deleted file mode 100644 index 4d0e34c4..00000000 --- a/internal/pkg/crawl/dependencies/ytdlp/server.go +++ /dev/null @@ -1,46 +0,0 @@ -package ytdlp - -import ( - "io" - "net" - "net/http" - "strings" -) - -func serveBody(body io.ReadCloser) (port int, stopChan chan struct{}, err error) { - stopChan = make(chan struct{}) - portChan := make(chan int) - - bodyBytes, err := io.ReadAll(body) - if err != nil { - return 0, nil, err - } - - // Start the server - go func() { - // Serve the body on the random port - listener, err := net.Listen("tcp", "127.0.0.1:0") - if err != nil { - panic(err) - } - defer listener.Close() - - portChan <- listener.Addr().(*net.TCPAddr).Port - - go func() { - <-stopChan - listener.Close() - }() - - // Create a handler that will serve the body on / - handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Write(bodyBytes) - }) - - if err := http.Serve(listener, handler); err != nil && !strings.Contains(err.Error(), "use of closed network connection") { - return - } - }() - - return <-portChan, stopChan, nil -} diff --git a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go deleted file mode 100644 index 1d628247..00000000 --- a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go +++ /dev/null @@ -1,95 +0,0 @@ -package ytdlp - -import ( - "bytes" - "encoding/json" - "fmt" - "os/exec" - "strconv" -) - -func getJSON(port int) (streamURLs, metaURLs []string, rawJSON string, HTTPHeaders map[string]string, err error) { - HTTPHeaders = make(map[string]string) - - // Prepare the command - cmd := exec.Command("yt-dlp", "http://localhost:"+strconv.Itoa(port), "--dump-json", "-f", 
"bv[protocol=https]+ba[protocol=https]") - - // Buffers to capture stdout and stderr - var stdout, stderr bytes.Buffer - cmd.Stdout = &stdout - cmd.Stderr = &stderr - - // Run the command - err = cmd.Run() - if err != nil { - return streamURLs, metaURLs, rawJSON, HTTPHeaders, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String()) - } - - output := stdout.String() - - // Parse the output as a Video object - var video Video - err = json.Unmarshal([]byte(output), &video) - if err != nil { - return streamURLs, metaURLs, rawJSON, HTTPHeaders, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err) - } - - // Get the manifest URL for the best video & audio quality - // Note: we do not archive live streams - if !video.IsLive { - if len(video.RequestedFormats) > 0 { - HTTPHeaders = video.RequestedFormats[0].HTTPHeaders - for _, format := range video.RequestedFormats { - // Choose stream_type= - // If acodec == "none" and vcodec != "none", it's "video" - // If acodec != "none" and vcodec == "none", it's "audio" - // If acodec != "none" and vcodec != "none", we don't specify stream_type - var streamType string - if format.Acodec == "none" && format.Vcodec != "none" { - streamType = "video" - } else if format.Acodec != "none" && format.Vcodec == "none" { - streamType = "audio" - } - - var URL = format.URL + "&video_id=" + video.ID - if streamType != "" { - URL += "&stream_type=" + streamType - } - - streamURLs = append(streamURLs, URL) - } - } - } - - // Get all subtitles (not automatic captions) - for _, subtitle := range video.Subtitles { - for _, sub := range subtitle { - metaURLs = append(metaURLs, sub.URL) - } - } - - // Get all thumbnail URLs - for _, thumbnail := range video.Thumbnails { - metaURLs = append(metaURLs, thumbnail.URL) - } - - // Get the storyboards - for _, format := range video.Formats { - if format.FormatNote == "storyboard" { - metaURLs = append(metaURLs, format.URL) - for _, fragment := range format.Fragments { - metaURLs = append(metaURLs, 
fragment.URL) - } - } - } - - return streamURLs, metaURLs, output, HTTPHeaders, nil -} - -func FindPath() (string, bool) { - path, err := exec.LookPath("yt-dlp") - if err != nil { - return "", false - } - return path, true -} diff --git a/internal/pkg/crawl/hq.go b/internal/pkg/crawl/hq.go deleted file mode 100644 index 8a9b9649..00000000 --- a/internal/pkg/crawl/hq.go +++ /dev/null @@ -1,387 +0,0 @@ -package crawl - -import ( - "math" - "net/url" - "strings" - "sync" - "time" - - "github.com/internetarchive/Zeno/internal/pkg/queue" - "github.com/internetarchive/Zeno/internal/pkg/utils" - "github.com/internetarchive/gocrawlhq" -) - -// This function connects to HQ's websocket and listen for messages. -// It also sends and "identify" message to the HQ to let it know that -// Zeno is connected. This "identify" message is sent every second and -// contains the crawler's stats and details. -func (c *Crawl) HQWebsocket() { - var ( - // the "identify" message will be sent every second - // to the crawl HQ - identifyTicker = time.NewTicker(time.Second) - ) - - defer func() { - identifyTicker.Stop() - }() - - // send an "identify" message to the crawl HQ every second - for { - err := c.HQClient.Identify(&gocrawlhq.IdentifyMessage{ - Project: c.HQProject, - Job: c.Job, - IP: utils.GetOutboundIP().String(), - Hostname: utils.GetHostname(), - GoVersion: utils.GetVersion().GoVersion, - }) - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending identify payload to crawl HQ, trying to reconnect..") - - err = c.HQClient.InitWebsocketConn() - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error initializing websocket connection to crawl HQ") - } - } - - <-identifyTicker.C - } -} - -func (c *Crawl) HQProducer() { - defer c.HQChannelsWg.Done() - - var ( - discoveredArray = []gocrawlhq.URL{} - mutex = sync.Mutex{} - terminateProducer = make(chan bool) - ) - - // the discoveredArray is 
sent to the crawl HQ every 10 seconds - // or when it reaches a certain size - go func() { - HQLastSent := time.Now() - - for { - select { - case <-terminateProducer: - // no need to lock the mutex here, because the producer channel - // is already closed, so no other goroutine can write to the slice - if len(discoveredArray) > 0 { - for { - err := c.HQClient.Add(discoveredArray, false) - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..") - time.Sleep(time.Second) - continue - } - break - } - } - - return - default: - mutex.Lock() - if (len(discoveredArray) >= int(math.Ceil(float64(c.Workers.Count)/2)) || time.Since(HQLastSent) >= time.Second*10) && len(discoveredArray) > 0 { - for { - err := c.HQClient.Add(discoveredArray, false) - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..") - time.Sleep(time.Second) - continue - } - break - } - - discoveredArray = []gocrawlhq.URL{} - HQLastSent = time.Now() - } - mutex.Unlock() - } - } - }() - - // listen to the discovered channel and add the URLs to the discoveredArray - for discoveredItem := range c.HQProducerChannel { - var via string - - if discoveredItem.ParentURL != nil { - via = utils.URLToString(discoveredItem.ParentURL) - } - - discoveredURL := gocrawlhq.URL{ - Value: utils.URLToString(discoveredItem.URL), - Via: via, - } - - for i := uint64(0); i < discoveredItem.Hop; i++ { - discoveredURL.Path += "L" - } - - // The reason we are using a string instead of a bool is because - // gob's encode/decode doesn't properly support booleans - if discoveredItem.BypassSeencheck { - for { - err := c.HQClient.Add([]gocrawlhq.URL{discoveredURL}, true) - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ - "bypassSeencheck": discoveredItem.BypassSeencheck, - })).Error("error sending 
payload to crawl HQ, waiting 1s then retrying..") - time.Sleep(time.Second) - continue - } - break - } - continue - } - - mutex.Lock() - discoveredArray = append(discoveredArray, discoveredURL) - mutex.Unlock() - } - - // if we are here, it means that the HQProducerChannel has been closed - // so we need to send the last payload to the crawl HQ - terminateProducer <- true -} - -func (c *Crawl) HQConsumer() { - for { - c.HQConsumerState = "running" - - // This is on purpose evaluated every time, - // because the value of workers will maybe change - // during the crawl in the future (to be implemented) - var HQBatchSize = int(c.Workers.Count) - - if c.Finished.Get() { - c.HQConsumerState = "finished" - c.Log.Error("crawl finished, stopping HQ consumer") - break - } - - // If HQContinuousPull is set to true, we will pull URLs from HQ continuously, - // otherwise we will only pull URLs when needed (and when the crawl is not paused) - for (c.Queue.GetStats().TotalElements > HQBatchSize && !c.HQContinuousPull) || c.Paused.Get() || c.Queue.HandoverOpen.Get() { - c.HQConsumerState = "waiting" - c.Log.Info("HQ producer waiting", "paused", c.Paused.Get(), "handoverOpen", c.Queue.HandoverOpen.Get(), "queueSize", c.Queue.GetStats().TotalElements) - time.Sleep(time.Millisecond * 50) - continue - } - - // If a specific HQ batch size is set, use it - if c.HQBatchSize != 0 { - HQBatchSize = c.HQBatchSize - } - - // get batch from crawl HQ - c.HQConsumerState = "waitingOnFeed" - var URLs []gocrawlhq.URL - var err error - if c.HQBatchConcurrency == 1 { - URLs, err = c.HQClient.Get(HQBatchSize, c.HQStrategy) - if err != nil { - // c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ - // "batchSize": HQBatchSize, - // "err": err, - // })).Debug("error getting new URLs from crawl HQ") - continue - } - } else { - var mu sync.Mutex - var wg sync.WaitGroup - batchSize := HQBatchSize / c.HQBatchConcurrency - URLsChan := make(chan []gocrawlhq.URL, c.HQBatchConcurrency) - - // 
Start goroutines to get URLs from crawl HQ, each will request - // HQBatchSize / HQConcurrentBatch URLs - for i := 0; i < c.HQBatchConcurrency; i++ { - wg.Add(1) - go func() { - defer wg.Done() - URLs, err := c.HQClient.Get(batchSize, c.HQStrategy) - if err != nil { - // c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ - // "batchSize": batchSize, - // "err": err, - // })).Debug("error getting new URLs from crawl HQ") - return - } - URLsChan <- URLs - }() - } - - // Wait for all goroutines to finish - go func() { - wg.Wait() - close(URLsChan) - }() - - // Collect all URLs from the channels - for URLsFromChan := range URLsChan { - mu.Lock() - URLs = append(URLs, URLsFromChan...) - mu.Unlock() - } - } - c.HQConsumerState = "feedCompleted" - - // send all URLs received in the batch to the queue - var items = make([]*queue.Item, 0, len(URLs)) - if len(URLs) > 0 { - for _, URL := range URLs { - c.HQConsumerState = "urlParse" - newURL, err := url.Parse(URL.Value) - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ - "url": URL.Value, - "batchSize": HQBatchSize, - "err": err, - })).Error("unable to parse URL received from crawl HQ, discarding") - continue - } - - c.HQConsumerState = "newItem" - newItem, err := queue.NewItem(newURL, nil, "seed", uint64(strings.Count(URL.Path, "L")), URL.ID, false) - if err != nil { - c.Log.WithFields(c.genLogFields(err, newURL, map[string]interface{}{ - "url": URL.Value, - "batchSize": HQBatchSize, - "err": err, - })).Error("unable to create new item from URL received from crawl HQ, discarding") - continue - } - - c.HQConsumerState = "append" - items = append(items, newItem) - } - } - - c.HQConsumerState = "enqueue" - err = c.Queue.BatchEnqueue(items...) 
- if err != nil { - c.Log.Error("unable to enqueue URL batch received from crawl HQ, discarding", "error", err) - continue - } - } -} - -func (c *Crawl) HQFinisher() { - defer c.HQChannelsWg.Done() - - var ( - finishedArray = []gocrawlhq.URL{} - locallyCrawledTotal int - ) - - for finishedItem := range c.HQFinishedChannel { - if finishedItem.ID == "" { - c.Log.WithFields(c.genLogFields(nil, finishedItem.URL, nil)).Warn("URL has no ID, discarding") - continue - } - - locallyCrawledTotal += int(finishedItem.LocallyCrawled) - finishedArray = append(finishedArray, gocrawlhq.URL{ID: finishedItem.ID, Value: utils.URLToString(finishedItem.URL)}) - - if len(finishedArray) == int(math.Ceil(float64(c.Workers.Count)/2)) { - for { - err := c.HQClient.Delete(finishedArray, locallyCrawledTotal) - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ - "finishedArray": finishedArray, - })).Error("error submitting finished urls to crawl HQ. retrying in one second...") - time.Sleep(time.Second) - continue - } - break - } - - finishedArray = []gocrawlhq.URL{} - locallyCrawledTotal = 0 - } - } - - // send remaining finished URLs - if len(finishedArray) > 0 { - for { - err := c.HQClient.Delete(finishedArray, locallyCrawledTotal) - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ - "finishedArray": finishedArray, - })).Error("error submitting finished urls to crawl HQ. 
retrying in one second...") - time.Sleep(time.Second) - continue - } - break - } - } -} - -func (c *Crawl) HQSeencheckURLs(URLs []*url.URL) (seencheckedBatch []*url.URL, err error) { - var ( - discoveredURLs []gocrawlhq.URL - ) - - for _, URL := range URLs { - discoveredURLs = append(discoveredURLs, gocrawlhq.URL{ - Value: utils.URLToString(URL), - Type: "asset", - }) - } - - outputURLs, err := c.HQClient.Seencheck(discoveredURLs) - if err != nil { - c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ - "batchLen": len(URLs), - "urls": discoveredURLs, - })).Error("error sending seencheck payload to crawl HQ") - return seencheckedBatch, err - } - - if outputURLs != nil { - for _, URL := range outputURLs { - // the returned payload only contain new URLs to be crawled by Zeno - newURL, err := url.Parse(URL.Value) - if err != nil { - c.Log.WithFields(c.genLogFields(err, URL, map[string]interface{}{ - "batchLen": len(URLs), - })).Error("error parsing URL from HQ seencheck response") - return seencheckedBatch, err - } - - seencheckedBatch = append(seencheckedBatch, newURL) - } - } - - return seencheckedBatch, nil -} - -// returns: -// - bool: true if the URL is new, false if it has been seen before -// - error: if there's an error sending the payload to crawl HQ -// -// NOTE: if there's an error, the URL is considered new -func (c *Crawl) HQSeencheckURL(URL *url.URL) (bool, error) { - discoveredURL := gocrawlhq.URL{ - Value: utils.URLToString(URL), - Type: "asset", - } - - outputURLs, err := c.HQClient.Seencheck([]gocrawlhq.URL{discoveredURL}) - if err != nil { - c.Log.Error("error sending seencheck payload to crawl HQ", "err", err, "url", utils.URLToString(URL)) - return true, err // return true, don't discard the URL if there's an error - } - - if outputURLs != nil { - for _, URL := range outputURLs { - // the returned payload only contain new URLs to be crawled by Zeno - if URL.Value == discoveredURL.Value { - return true, nil - } - } - } - - return 
false, nil -} diff --git a/internal/pkg/crawl/link_header_test.go b/internal/pkg/crawl/link_header_test.go deleted file mode 100644 index 2d165721..00000000 --- a/internal/pkg/crawl/link_header_test.go +++ /dev/null @@ -1,100 +0,0 @@ -package crawl - -import ( - "slices" - "testing" -) - -func TestParseOneLink(t *testing.T) { - var links []Link - links = append(links, Link{URL: "https://one.example.com", Rel: "preconnect"}) - - var link = `; rel="preconnect"` - - got := Parse(link) - want := links - - if !slices.Equal(got, want) { - t.Fatalf("got %q, wanted %q", got, want) - } -} - -func TestParseMultipleLinks(t *testing.T) { - var links []Link - links = append(links, - Link{URL: "https://test.com", Rel: "preconnect"}, - Link{URL: "https://app.test.com", Rel: "preconnect"}, - Link{URL: "https://example.com", Rel: "preconnect"}, - ) - - var link = `; rel="preconnect", ; rel="preconnect"; foo="bar", ; rel="preconnect"` - - got := Parse(link) - want := links - - if !slices.Equal(got, want) { - t.Fatalf("got %q, wanted %q", got, want) - } -} - -func TestParseOneMalformedLink(t *testing.T) { - var links []Link - links = append(links, Link{URL: "https://one.example.com", Rel: "preconnect"}) - - var link = `https://one.example.com>;; rel=preconnect";` - - got := Parse(link) - want := links - - if !slices.Equal(got, want) { - t.Fatalf("got %q, wanted %q", got, want) - } -} - -func TestParseMultipleMalformedLinks(t *testing.T) { - var links []Link - links = append(links, - Link{URL: "", Rel: "preconnect"}, - Link{URL: "https://app.test.com", Rel: ""}, - Link{URL: "", Rel: ""}, - ) - - var link = `; rel="preconnect", https://app.test.com; rel=""; "bar", <>; ="preconnect"` - - got := Parse(link) - want := links - - if !slices.Equal(got, want) { - t.Fatalf("got %q, wanted %q", got, want) - } -} - -func TestParseAttr(t *testing.T) { - attr := `rel="preconnect"` - - gotKey, gotValue := ParseAttr(attr) - wantKey, wantValue := "rel", "preconnect" - - if gotKey != wantKey { - 
t.Fatalf("got %q, wanted %q", gotKey, wantKey) - } - - if gotValue != wantValue { - t.Fatalf("got %q, wanted %q", gotValue, wantValue) - } -} - -func TestParseMalformedAttr(t *testing.T) { - attr := `="preconnect"` - - gotKey, gotValue := ParseAttr(attr) - wantKey, wantValue := "", "preconnect" - - if gotKey != wantKey { - t.Fatalf("got %q, wanted %q", gotKey, wantKey) - } - - if gotValue != wantValue { - t.Fatalf("got %q, wanted %q", gotValue, wantValue) - } -} diff --git a/internal/pkg/crawl/sitespecific/cloudflarestream/cloudflarestream.go b/internal/pkg/crawl/sitespecific/cloudflarestream/cloudflarestream.go deleted file mode 100644 index 8951b9ee..00000000 --- a/internal/pkg/crawl/sitespecific/cloudflarestream/cloudflarestream.go +++ /dev/null @@ -1,454 +0,0 @@ -package cloudflarestream - -import ( - "encoding/xml" - "errors" - "io" - "io/ioutil" - "math" - "net/url" - "strconv" - "strings" - - "github.com/CorentinB/warc" - "github.com/PuerkitoBio/goquery" - "github.com/internetarchive/Zeno/internal/pkg/utils" -) - -type MPD struct { - XMLName xml.Name `xml:"MPD"` - Text string `xml:",chardata"` - Xmlns string `xml:"xmlns,attr"` - Profiles string `xml:"profiles,attr"` - Type string `xml:"type,attr"` - MediaPresentationDuration string `xml:"mediaPresentationDuration,attr"` - MinBufferTime string `xml:"minBufferTime,attr"` - Period struct { - Text string `xml:",chardata"` - ID string `xml:"id,attr"` - AdaptationSet []struct { - Text string `xml:",chardata"` - ID string `xml:"id,attr"` - MimeType string `xml:"mimeType,attr"` - SegmentAlignment string `xml:"segmentAlignment,attr"` - Lang string `xml:"lang,attr"` - Representation []struct { - Text string `xml:",chardata"` - ID string `xml:"id,attr"` - AudioSamplingRate string `xml:"audioSamplingRate,attr"` - Bandwidth string `xml:"bandwidth,attr"` - Codecs string `xml:"codecs,attr"` - FrameRate string `xml:"frameRate,attr"` - Height string `xml:"height,attr"` - Width string `xml:"width,attr"` - 
AudioChannelConfiguration struct { - Text string `xml:",chardata"` - SchemeIdUri string `xml:"schemeIdUri,attr"` - Value string `xml:"value,attr"` - } `xml:"AudioChannelConfiguration"` - SegmentTemplate struct { - Text string `xml:",chardata"` - Duration string `xml:"duration,attr"` - Initialization string `xml:"initialization,attr"` - Media string `xml:"media,attr"` - StartNumber string `xml:"startNumber,attr"` - Timescale string `xml:"timescale,attr"` - } `xml:"SegmentTemplate"` - } `xml:"Representation"` - } `xml:"AdaptationSet"` - } `xml:"Period"` -} - -func IsURL(URL string) bool { - return strings.Contains(URL, "cloudflarestream.com") -} - -func GetJSFiles(doc *goquery.Document, watchPageURL *url.URL, httpClient warc.CustomHTTPClient) (archivedURLs []string, err error) { - var latestJSURL string - - // Look for the $10,000 Every Day You Survive In A Grocery Store - YouTube
PrésentationPresseDroits d'auteurNous contacterCréateursPublicitéDéveloppeursRésilier vos abonnementsConditions d'utilisationConfidentialitéRègles et sécuritéPremiers pas sur YouTubeTester de nouvelles fonctionnalités
\ No newline at end of file diff --git a/internal/pkg/finisher/error.go b/internal/pkg/finisher/error.go new file mode 100644 index 00000000..140c3a68 --- /dev/null +++ b/internal/pkg/finisher/error.go @@ -0,0 +1,10 @@ +package finisher + +import "errors" + +var ( + // ErrFinisherAlreadyInitialized is the error returned when the finisher is already initialized + ErrFinisherAlreadyInitialized = errors.New("finisher already initialized") + // ErrFinisherNotInitialized is the error returned when the finisher is not initialized + ErrFinisherNotInitialized = errors.New("finisher not initialized") +) diff --git a/internal/pkg/finisher/finisher.go b/internal/pkg/finisher/finisher.go new file mode 100644 index 00000000..e1856da1 --- /dev/null +++ b/internal/pkg/finisher/finisher.go @@ -0,0 +1,151 @@ +package finisher + +import ( + "context" + "fmt" + "strconv" + "sync" + + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/controler/pause" + "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/Zeno/internal/pkg/reactor" + "github.com/internetarchive/Zeno/internal/pkg/stats" + "github.com/internetarchive/Zeno/pkg/models" +) + +type finisher struct { + ctx context.Context + cancel context.CancelFunc + inputCh chan *models.Item + sourceFinishedCh chan *models.Item + sourceProducedCh chan *models.Item + wg sync.WaitGroup +} + +var ( + globalFinisher *finisher + once sync.Once + logger *log.FieldedLogger +) + +// Start initializes the global finisher with the given input channel. +// This method can only be called once. 
+func Start(inputChan, sourceFinishedChan, sourceProducedChan chan *models.Item) error { + var done bool + + log.Start() + logger = log.NewFieldedLogger(&log.Fields{ + "component": "finisher", + }) + + once.Do(func() { + ctx, cancel := context.WithCancel(context.Background()) + globalFinisher = &finisher{ + ctx: ctx, + cancel: cancel, + inputCh: inputChan, + sourceFinishedCh: sourceFinishedChan, + sourceProducedCh: sourceProducedChan, + wg: sync.WaitGroup{}, + } + logger.Debug("initialized") + for i := 0; i < config.Get().WorkersCount; i++ { + globalFinisher.wg.Add(1) + go globalFinisher.worker(strconv.Itoa(i)) + } + logger.Info("started") + done = true + }) + + if !done { + return ErrFinisherAlreadyInitialized + } + + return nil +} + +// Stop stops the global finisher. +func Stop() { + if globalFinisher != nil { + logger.Debug("received stop signal") + globalFinisher.cancel() + globalFinisher.wg.Wait() + globalFinisher = nil + once = sync.Once{} + logger.Info("stopped") + } +} + +func (f *finisher) worker(workerID string) { + defer f.wg.Done() + logger := log.NewFieldedLogger(&log.Fields{ + "component": "finisher.worker", + "worker_id": workerID, + }) + + controlChans := pause.Subscribe() + defer pause.Unsubscribe(controlChans) + + for { + select { + case <-f.ctx.Done(): + logger.Debug("shutting down") + return + case <-controlChans.PauseCh: + logger.Debug("received pause event") + controlChans.ResumeCh <- struct{}{} + logger.Debug("received resume event") + case seed, ok := <-f.inputCh: + if ok { + if seed == nil { + panic("received nil seed") + } + + if !seed.IsSeed() { + panic("received non-seed item") + } + + logger.Debug("received seed", "seed", seed.GetShortID()) + + if err := seed.CheckConsistency(); err != nil { + panic(fmt.Sprintf("seed consistency check failed with err: %s, seed id %s, worker id %s", err.Error(), seed.GetShortID(), workerID)) + } + + // If the seed is fresh, send it to the source + if seed.GetStatus() == models.ItemFresh { + 
logger.Debug("fresh seed received", "seed", seed) + f.sourceProducedCh <- seed + continue + } + + // If the seed has fresh children, send it to feedback + isComplete := seed.CompleteAndCheck() + if !isComplete { + logger.Debug("seed has fresh children", "seed", seed.GetShortID()) + err := reactor.ReceiveFeedback(seed) + if err != nil && err != reactor.ErrReactorFrozen { + panic(err) + } + continue + } + + // If the seed has no fresh redirection or children, mark it as finished + logger.Debug("seed has no fresh redirection or children", "seed", seed.GetShortID()) + err := reactor.MarkAsFinished(seed) + if err != nil { + panic(err) + } + + // Notify the source that the seed has been finished + // E.g.: to delete the seed in Crawl HQ + if f.sourceFinishedCh != nil { + f.sourceFinishedCh <- seed + } + + stats.SeedsFinishedIncr() + logger.Debug("seed finished", "seed", seed.GetShortID()) + } + } + } +} diff --git a/internal/pkg/log/config.go b/internal/pkg/log/config.go new file mode 100644 index 00000000..6033fbd6 --- /dev/null +++ b/internal/pkg/log/config.go @@ -0,0 +1,180 @@ +// config.go +package log + +import ( + "context" + "fmt" + "log/slog" + "os" + "strings" + "time" + + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/log/ringbuffer" + slogmulti "github.com/samber/slog-multi" +) + +var ( + rotatedLogFile *rotatedFile +) + +type logConfig struct { + FileConfig *logfileConfig + StdoutEnabled bool + StdoutLevel slog.Level + StderrEnabled bool + StderrLevel slog.Level + ElasticsearchConfig *elasticsearchConfig + LogTUI bool + TUILogLevel slog.Level +} + +type logfileConfig struct { + Dir string + Prefix string + Level slog.Level + Rotate bool + RotatePeriod time.Duration +} + +type elasticsearchConfig struct { + Addresses string + Username string + Password string + IndexPrefix string + Level slog.Level + Rotate bool + RotatePeriod time.Duration +} + +// makeConfig returns the default configuration +func 
makeConfig() *logConfig { + if config.Get() == nil { + return &logConfig{ + FileConfig: nil, + StdoutEnabled: true, + StdoutLevel: slog.LevelInfo, + StderrEnabled: true, + StderrLevel: slog.LevelError, + ElasticsearchConfig: nil, + LogTUI: false, + } + } + + fileRotatePeriod, err := time.ParseDuration(config.Get().LogFileRotation) + if err != nil && config.Get().LogFileRotation != "" { + fileRotatePeriod = 1 * time.Hour + } + + elasticRotatePeriod, err := time.ParseDuration(config.Get().ElasticSearchRotation) + if err != nil && config.Get().ElasticSearchRotation != "" { + elasticRotatePeriod = 24 * time.Hour + } + + var logFileOutputDir string + if logFileOutputDir = config.Get().LogFileOutputDir; logFileOutputDir == "" { + logFileOutputDir = fmt.Sprintf("%s/logs", config.Get().JobPath) + } + + var logFileConfig *logfileConfig + if !config.Get().NoFileLogging { + logFileConfig = &logfileConfig{ + Dir: logFileOutputDir, + Prefix: config.Get().LogFilePrefix, + Level: parseLevel(config.Get().LogFileLevel), + Rotate: config.Get().LogFileRotation != "", + RotatePeriod: fileRotatePeriod, + } + } else { + logFileConfig = nil + } + + var elasticConfig *elasticsearchConfig + if config.Get().ElasticSearchURLs != "" { + elasticConfig = &elasticsearchConfig{ + Addresses: config.Get().ElasticSearchURLs, + Username: config.Get().ElasticSearchUsername, + Password: config.Get().ElasticSearchPassword, + IndexPrefix: config.Get().ElasticSearchIndexPrefix, + Level: parseLevel(config.Get().ElasticSearchLogLevel), + Rotate: config.Get().ElasticSearchRotation != "", + RotatePeriod: elasticRotatePeriod, + } + } else { + elasticConfig = nil + } + + return &logConfig{ + FileConfig: logFileConfig, + ElasticsearchConfig: elasticConfig, + StdoutEnabled: !config.Get().NoStdoutLogging, + StdoutLevel: parseLevel(config.Get().StdoutLogLevel), + StderrEnabled: !config.Get().NoStderrLogging, + StderrLevel: slog.LevelError, + LogTUI: config.Get().TUI, + TUILogLevel: 
parseLevel(config.Get().TUILogLevel), + } +} + +func parseLevel(level string) slog.Level { + lowercaseLevel := strings.ToLower(level) + switch lowercaseLevel { + case "debug": + return slog.LevelDebug + case "info": + return slog.LevelInfo + case "warn": + return slog.LevelWarn + case "error": + return slog.LevelError + default: + return slog.LevelInfo + } +} + +func (c *logConfig) makeMultiLogger() *slog.Logger { + baseRouter := slogmulti.Router() + + // Handle stdout/stderr logging configuration + // If Stdout and Stderr are both enabled we log every level below stderr level to stdout and the rest (above) to stderr + if c.StdoutEnabled && c.StderrEnabled { + stderrHandler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: c.StderrLevel}) + baseRouter = baseRouter.Add(stderrHandler, func(_ context.Context, r slog.Record) bool { + return r.Level >= c.StderrLevel + }) + + stdoutHandler := slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: c.StdoutLevel}) + baseRouter = baseRouter.Add(stdoutHandler, func(_ context.Context, r slog.Record) bool { + return r.Level >= c.StdoutLevel && r.Level < c.StderrLevel + }) + } else if c.StdoutEnabled { + stdoutHandler := slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: c.StdoutLevel}) + baseRouter = baseRouter.Add(stdoutHandler, func(_ context.Context, r slog.Record) bool { + return r.Level >= c.StdoutLevel + }) + } + + // Handle file logging configuration + if c.FileConfig != nil { + rotatedLogFile = newRotatedFile(c.FileConfig) + fileHandler := slog.NewTextHandler(rotatedLogFile, &slog.HandlerOptions{Level: c.FileConfig.Level}) + baseRouter = baseRouter.Add(fileHandler, func(_ context.Context, r slog.Record) bool { + return r.Level >= c.FileConfig.Level + }) + } + + // Handle TUI logging configuration + if c.LogTUI { + TUIRingBuffer = ringbuffer.NewMP1COverwritingRingBuffer[string](16384) + rbWriter := ringbuffer.NewWriter(TUIRingBuffer) + rbHandler := slog.NewTextHandler(rbWriter, 
&slog.HandlerOptions{Level: c.TUILogLevel}) + baseRouter = baseRouter.Add(rbHandler, func(_ context.Context, r slog.Record) bool { + return r.Level >= c.TUILogLevel + }) + } + + // Handle Elasticsearch logging configuration + // TODO + + return slog.New(baseRouter.Handler()) +} diff --git a/internal/pkg/log/dumper/dumper.go b/internal/pkg/log/dumper/dumper.go new file mode 100644 index 00000000..bf255d84 --- /dev/null +++ b/internal/pkg/log/dumper/dumper.go @@ -0,0 +1,47 @@ +package dumper + +import ( + "fmt" + "os" + "time" + + "github.com/davecgh/go-spew/spew" + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/Zeno/internal/pkg/reactor" + "github.com/internetarchive/Zeno/pkg/models" +) + +// Dump writes a spew dump of the items and an ASCII pretty print of the items to a dump file then returns the path to the dump file. +func Dump(items ...*models.Item) string { + // Creates a dump file to be written to by the dumper + var dumpFilePath string + if dumpFilePath = config.Get().LogFileOutputDir; dumpFilePath == "" { + dumpFilePath = fmt.Sprintf("%s/logs/dump-%s", config.Get().JobPath, time.Now().Format(time.RFC3339)) + } else { + dumpFilePath = fmt.Sprintf("%s/dump-%s", dumpFilePath, time.Now().Format(time.RFC3339)) + } + dumpFile, err := os.Create(dumpFilePath) + if err != nil { + log.Error("failed to create dump file: %v", err) + } + defer dumpFile.Close() + + if len(items) == 0 { + items = reactor.GetStateTableItems() + } + + for i := range items { + fmt.Fprintf(dumpFile, "Item: %s\n", items[i].GetID()) + spew.Fdump(dumpFile, items[i]) + fmt.Fprintf(dumpFile, "\n%s\n_______________________________", items[i].DrawTreeWithStatus()) + } + + return dumpFilePath +} + +// PanicWithDump writes a spew dump of the items and an ASCII pretty print of the items to a dump file then panics with a message. 
+func PanicWithDump(message string, items ...*models.Item) { + dumpFilePath := Dump(items...) + panic(fmt.Sprintf("with item dump file: %s AND message: %s", dumpFilePath, message)) +} diff --git a/internal/pkg/log/error.go b/internal/pkg/log/error.go new file mode 100644 index 00000000..dc99de15 --- /dev/null +++ b/internal/pkg/log/error.go @@ -0,0 +1,8 @@ +package log + +import "errors" + +var ( + // ErrLoggerAlreadyInitialized is the error returned when the logger is already initialized + ErrLoggerAlreadyInitialized = errors.New("logger already initialized") +) diff --git a/internal/pkg/log/fielded_logger.go b/internal/pkg/log/fielded_logger.go new file mode 100644 index 00000000..679c997f --- /dev/null +++ b/internal/pkg/log/fielded_logger.go @@ -0,0 +1,62 @@ +package log + +import ( + "context" + "log/slog" +) + +// Field defines an interface for fields +type Fields map[string]interface{} + +// FieldedLogger allows adding predefined fields to log entries +type FieldedLogger struct { + fields *Fields +} + +// NewFieldedLogger creates a new FieldedLogger with the given fields +func NewFieldedLogger(args *Fields) *FieldedLogger { + return &FieldedLogger{ + fields: args, + } +} + +// Debug logs a message at the debug level with the predefined fields +func (fl *FieldedLogger) Debug(msg string, args ...any) { + fl.logWithLevel(slog.LevelDebug, msg, args...) +} + +// Info logs a message at the info level with the predefined fields +func (fl *FieldedLogger) Info(msg string, args ...any) { + fl.logWithLevel(slog.LevelInfo, msg, args...) +} + +// Warn logs a message at the warn level with the predefined fields +func (fl *FieldedLogger) Warn(msg string, args ...any) { + fl.logWithLevel(slog.LevelWarn, msg, args...) +} + +// Error logs a message at the error level with the predefined fields +func (fl *FieldedLogger) Error(msg string, args ...any) { + fl.logWithLevel(slog.LevelError, msg, args...) 
+} + +func (fl *FieldedLogger) logWithLevel(level slog.Level, msg string, args ...any) { + var combinedArgs []any + + if fl.fields != nil { + for k, v := range *fl.fields { + combinedArgs = append(combinedArgs, k) + combinedArgs = append(combinedArgs, v) + } + } + + if len(args) > 0 { + for _, arg := range args { + combinedArgs = append(combinedArgs, arg) + } + } + + if multiLogger != nil { + multiLogger.Log(context.TODO(), level, msg, combinedArgs...) + } +} diff --git a/internal/pkg/log/log.go b/internal/pkg/log/log.go index e5b56b61..ed08dc60 100644 --- a/internal/pkg/log/log.go +++ b/internal/pkg/log/log.go @@ -1,262 +1,74 @@ -// Package log provides a custom logging solution with multi-output support -// and log rotation for file output. -// ----------------------------------------------------------------------------- -// When Logger.{Debug, Info, Warn, Error, Fatal} is called, the log message is -// passed to all underlying handlers represented by Logger.handler -// Then multiHandler.Handle is called to pass the log message to all underlying handlers. -// ----------------------------------------------------------------------------- -// The rotation mechanism works by locking the logger, checking if it's time to rotate, -// and then calling the Rotate method on all rotatable handlers. 
+// log.go package log import ( - "context" - "fmt" "log/slog" - "os" - "path/filepath" "sync" - "sync/atomic" - "time" - "github.com/elastic/go-elasticsearch/v8" + "github.com/internetarchive/Zeno/internal/pkg/log/ringbuffer" ) +// Global variables var ( - isLoggerInit *atomic.Bool - storedLogger *Logger - once sync.Once -) - -// Logger wraps slog.Logger to provide multi-output functionality -type Logger struct { - sync.Mutex - handler *multiHandler - slogger *slog.Logger - stopRotation chan struct{} - stopErrorLog chan struct{} - errorChan chan error -} - -// Config holds the configuration for the logger -type Config struct { - FileConfig *LogfileConfig - FileLevel slog.Level - StdoutEnabled bool - StdoutLevel slog.Level - RotateLogFile bool - ElasticsearchConfig *ElasticsearchConfig - RotateElasticSearchIndex bool - isDefault bool -} - -// New creates a new Logger instance with the given configuration. -// It sets up handlers for stdout (text format) and file output (JSON format) if specified. -// If FileOutput is empty, only stdout logging will be enabled. -// Only the first call to New will store the logger to be reused. Subsequent calls will return a new logger instance. -// Only the first call to New will rotate the logs destinations. -// Please refrain from calling New multiple times in the same program. 
-// -// Parameters: -// - cfg: Config struct containing logger configuration options -// -// Returns: -// - *Logger: A new Logger instance -// - error: An error if there was a problem creating the logger (e.g., unable to open log file) -func New(cfg Config) (*Logger, error) { - var handlers []slog.Handler - - // Create stdout handler - if cfg.StdoutEnabled { - stdoutHandler := slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{ - Level: cfg.StdoutLevel, - }) - handlers = append(handlers, stdoutHandler) - } - - // Create file handler if FileOutput is specified - if cfg.FileConfig != nil { - // Create directories if they don't exist - err := os.MkdirAll(filepath.Dir(cfg.FileConfig.Filename()), 0755) - if err != nil { - return nil, err - } - - // Open log file - file, err := os.OpenFile(cfg.FileConfig.Filename(), os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) - if err != nil { - return nil, err - } - fileHandler := &fileHandler{ - Handler: slog.NewJSONHandler(file, &slog.HandlerOptions{Level: cfg.FileLevel}), - fileDescriptor: file, - rotationInterval: 6 * time.Hour, - lastRotation: time.Now(), - logfileConfig: cfg.FileConfig, - } - handlers = append(handlers, fileHandler) - } - - // Create Elasticsearch handler if ElasticsearchConfig is specified - if cfg.ElasticsearchConfig != nil { - esClient, err := elasticsearch.NewClient(elasticsearch.Config{ - Addresses: cfg.ElasticsearchConfig.Addresses, - Username: cfg.ElasticsearchConfig.Username, - Password: cfg.ElasticsearchConfig.Password, - }) - if err != nil { - return nil, fmt.Errorf("failed to create Elasticsearch client: %w", err) - } - esHandler := &ElasticsearchHandler{ - client: esClient, - index: fmt.Sprintf("%s-%s", cfg.ElasticsearchConfig.IndexPrefix, time.Now().Format("2006.01.02")), - level: cfg.ElasticsearchConfig.Level, - attrs: []slog.Attr{}, - groups: []string{}, - config: cfg.ElasticsearchConfig, - } - if err := esHandler.createIndex(); err != nil { - return nil, fmt.Errorf("failed to create Elasticsearch 
index: %w", err) - } - handlers = append(handlers, esHandler) - } - - // Create multi-handler - mh := &multiHandler{handlers: handlers} - - // Create slog.Logger - slogger := slog.New(mh) - - logger := &Logger{ - handler: mh, - slogger: slogger, - errorChan: make(chan error, 10), - stopErrorLog: make(chan struct{}), - } - - if !cfg.isDefault { - once.Do(func() { - isLoggerInit = new(atomic.Bool) - storedLogger = logger - isLoggerInit.CompareAndSwap(false, true) + once sync.Once + wg sync.WaitGroup + multiLogger *slog.Logger - // Start rotation goroutine - logger.startRotation() - }) - } + TUIRingBuffer *ringbuffer.MP1COverwritingRingBuffer[string] +) - return logger, nil -} +// Start initializes the logging package with the given configuration. +// If no configuration is provided, it uses the default configuration. +func Start() error { + var done = false -// DefaultOrStored returns the default Logger instance or if already initialized, the logger created by first call to New(). -// The default logger writes to both stdout (text format) and a file named "app.log" (JSON format). -// Both outputs are set to log messages at Info level and above. -// This function uses sync.Once to ensure that the default logger is only created once. 
-// -// Returns: -// - *Logger: The default Logger instance -// - bool: True if the logger was created by this function, false if the logger was already initialized -func DefaultOrStored() (*Logger, bool) { - var created = false once.Do(func() { - isLoggerInit = new(atomic.Bool) - logger, err := New(Config{ - FileConfig: &LogfileConfig{Dir: "jobs", Prefix: "zeno"}, - FileLevel: slog.LevelInfo, - StdoutLevel: slog.LevelInfo, - isDefault: true, - }) - if err != nil { - panic(err) - } - storedLogger = logger - created = isLoggerInit.CompareAndSwap(false, true) + config := makeConfig() + multiLogger = config.makeMultiLogger() + done = true }) - return storedLogger, created -} -// GetStoredLogger returns the logger created by the first call to New() or DefaultOrStored(). -// If the logger has not been initialized, it will return nil. -func GetStoredLogger() *Logger { - return storedLogger -} + if !done { + return ErrLoggerAlreadyInitialized + } -// Errors returns a channel that will receive logging errors -func (l *Logger) Errors() <-chan error { - return l.errorChan + return nil } -func (l *Logger) log(level slog.Level, msg string, args ...any) { - l.Lock() - defer l.Unlock() - - // Create a new Record with the message and args - r := slog.NewRecord(time.Now(), level, msg, 0) - r.Add(args...) - - err := l.handler.Handle(context.Background(), r) - if err != nil { - select { - case l.errorChan <- err: - default: - // If the error channel is full, log to stderr as a last resort - fmt.Fprintf(os.Stderr, "Logging error: %v\n", err) - } +// Stop gracefully shuts down the logging system +func Stop() { + if rotatedLogFile != nil { + rotatedLogFile.Close() } + wg.Wait() + multiLogger = nil + once = sync.Once{} } -// Debug logs a message at Debug level. -// The first argument is the message to log, and subsequent arguments are key-value pairs -// that will be included in the log entry. 
-// -// Parameters: -// - msg: The message to log -// - args: Optional key-value pairs to include in the log entry -func (l *Logger) Debug(msg string, args ...any) { - l.log(slog.LevelDebug, msg, args...) -} - -// Info logs a message at Info level. -// The first argument is the message to log, and subsequent arguments are key-value pairs -// that will be included in the log entry. -// -// Parameters: -// - msg: The message to log -// - args: Optional key-value pairs to include in the log entry -func (l *Logger) Info(msg string, args ...any) { - l.log(slog.LevelInfo, msg, args...) +// Debug logs a message at the debug level +func Debug(msg string, args ...any) { + if multiLogger != nil { + multiLogger.Debug(msg, args...) + } } -// Warn logs a message at Warn level. -// The first argument is the message to log, and subsequent arguments are key-value pairs -// that will be included in the log entry. -// -// Parameters: -// - msg: The message to log -// - args: Optional key-value pairs to include in the log entry -func (l *Logger) Warn(msg string, args ...any) { - l.log(slog.LevelWarn, msg, args...) +// Info logs a message at the info level +func Info(msg string, args ...any) { + if multiLogger != nil { + multiLogger.Info(msg, args...) + } } -// Error logs a message at Error level. -// The first argument is the message to log, and subsequent arguments are key-value pairs -// that will be included in the log entry. -// -// Parameters: -// - msg: The message to log -// - args: Optional key-value pairs to include in the log entry -func (l *Logger) Error(msg string, args ...any) { - l.log(slog.LevelError, msg, args...) +// Warn logs a message at the warn level +func Warn(msg string, args ...any) { + if multiLogger != nil { + multiLogger.Warn(msg, args...) + } } -// Fatal logs a message at Error level and then calls os.Exit(1). -// The first argument is the message to log, and subsequent arguments are key-value pairs -// that will be included in the log entry. 
-// -// Parameters: -// - msg: The message to log -// - args: Optional key-value pairs to include in the log entry -func (l *Logger) Fatal(msg string, args ...any) { - l.log(slog.LevelError, msg, args...) - os.Exit(1) +// Error logs a message at the error level +func Error(msg string, args ...any) { + if multiLogger != nil { + multiLogger.Error(msg, args...) + } } diff --git a/internal/pkg/log/ringbuffer/ringbuffer.go b/internal/pkg/log/ringbuffer/ringbuffer.go new file mode 100644 index 00000000..c83b45c4 --- /dev/null +++ b/internal/pkg/log/ringbuffer/ringbuffer.go @@ -0,0 +1,118 @@ +package ringbuffer + +import ( + "math/bits" + "sync/atomic" +) + +// MP1COverwritingRingBuffer is a multi-producer, single-consumer ring buffer +// with a fixed size that overwrites the oldest item when full. +type MP1COverwritingRingBuffer[T any] struct { + items []atomic.Value // ring storage of generic type T + size uint64 // always a power of two + mask uint64 + + // tail: total number of items "reserved" by producers so far + tail atomic.Uint64 + + // head: total number of items consumed so far (or forcibly advanced) + head atomic.Uint64 +} + +// NewMP1COverwritingRingBuffer creates a ring buffer with capacity at least 'capacity', +// rounding up to the next power of two, so the ring won't grow infinitely. +func NewMP1COverwritingRingBuffer[T any](capacity uint64) *MP1COverwritingRingBuffer[T] { + size := nextPowerOfTwo(capacity) + mask := size - 1 + + rb := &MP1COverwritingRingBuffer[T]{ + items: make([]atomic.Value, size), + size: size, + mask: mask, + } + rb.head.Store(0) + rb.tail.Store(0) + return rb +} + +// nextPowerOfTwo rounds n up to the nearest power of two. +// This ensures index calculations are fast (using & mask). +func nextPowerOfTwo(n uint64) uint64 { + if n < 2 { + return 2 + } + return 1 << (64 - bits.LeadingZeros64(n-1)) +} + +// Enqueue writes 'val' into the ring buffer, overwriting the oldest entry if necessary. 
+// It never fails or blocks; producers always succeed. +func (rb *MP1COverwritingRingBuffer[T]) Enqueue(val T) { + for { + oldTail := rb.tail.Load() + oldHead := rb.head.Load() + + // If we appear "full", forcibly advance head by 1 (overwriting oldest). + if oldTail-oldHead >= rb.size { + // Attempt to increment head by 1. + // If CAS fails, we retry the entire loop. + if !rb.head.CompareAndSwap(oldHead, oldHead+1) { + continue + } + } + + // Reserve the next slot via CAS on tail + if rb.tail.CompareAndSwap(oldTail, oldTail+1) { + // We have claimed index 'oldTail & rb.mask'. + idx := oldTail & rb.mask + rb.items[idx].Store(val) + return + } + // If CAS fails, another producer advanced tail first; retry. + } +} + +// DumpN reads up to 'maxCount' items in a single batch. +// Returns a slice of length <= maxCount. If empty, returns nil. +// +// This is a single-consumer operation; only one goroutine should call DumpN. +func (rb *MP1COverwritingRingBuffer[T]) DumpN(maxCount uint64) []T { + var zero T + + for { + oldHead := rb.head.Load() + oldTail := rb.tail.Load() + + // If buffer is empty + if oldHead == oldTail { + return nil + } + + // Number of items currently available + available := oldTail - oldHead + n := available + if n > maxCount { + n = maxCount + } + + // Copy out up to n items + out := make([]T, 0, n) + for i := uint64(0); i < n; i++ { + idx := (oldHead + i) & rb.mask + val := rb.items[idx].Load() + typedVal, _ := val.(T) + out = append(out, typedVal) + } + + // Try to consume all n items at once + if rb.head.CompareAndSwap(oldHead, oldHead+n) { + // (Optional) Zero out the consumed slots for GC or security reasons + for i := uint64(0); i < n; i++ { + idx := (oldHead + i) & rb.mask + rb.items[idx].Store(zero) + } + return out + } + // If CAS fails, a producer forcibly advanced head or there's a race; + // retry with updated values. 
+ } +} diff --git a/internal/pkg/log/ringbuffer/ringbuffer_test.go b/internal/pkg/log/ringbuffer/ringbuffer_test.go new file mode 100644 index 00000000..64203459 --- /dev/null +++ b/internal/pkg/log/ringbuffer/ringbuffer_test.go @@ -0,0 +1,306 @@ +package ringbuffer + +import ( + "fmt" + "math/rand" + "sync" + "sync/atomic" + "testing" + "time" +) + +// TestBasic verifies basic enqueue/dump functionality under single-thread usage. +func TestBasic(t *testing.T) { + rb := NewMP1COverwritingRingBuffer[int](4) + + // Enqueue some items + rb.Enqueue(1) + rb.Enqueue(2) + rb.Enqueue(3) + + // Dump up to 2 + out := rb.DumpN(2) + if len(out) != 2 { + t.Errorf("DumpN(2) = %v items, want 2", len(out)) + } + if out[0] != 1 || out[1] != 2 { + t.Errorf("Dumped wrong values: got %v, want [1,2]", out) + } + + // Dump up to 2 again + out = rb.DumpN(2) + if len(out) != 1 { + t.Errorf("Expected only 1 item left, got %d", len(out)) + } + if out[0] != 3 { + t.Errorf("Got %v, want 3", out[0]) + } + + // Now buffer is empty + out = rb.DumpN(2) + if out != nil { + t.Errorf("Expected empty nil slice, got %v", out) + } +} + +// TestNextPowerOfTwo verifies the nextPowerOfTwo function. 
+func TestNextPowerOfTwo(t *testing.T) { + tests := []struct { + input uint64 + expected uint64 + }{ + {0, 2}, + {1, 2}, + {2, 2}, + {3, 4}, + {4, 4}, + {5, 8}, + {6, 8}, + {7, 8}, + {8, 8}, + {9, 16}, + {15, 16}, + {16, 16}, + {17, 32}, + {31, 32}, + {32, 32}, + {33, 64}, + {63, 64}, + {64, 64}, + {65, 128}, + {127, 128}, + {128, 128}, + {129, 256}, + {255, 256}, + {256, 256}, + {257, 512}, + {511, 512}, + {512, 512}, + {513, 1024}, + {1023, 1024}, + {1024, 1024}, + {1025, 2048}, + } + + for _, tt := range tests { + t.Run(fmt.Sprintf("nextPowerOfTwo(%d)", tt.input), func(t *testing.T) { + if got := nextPowerOfTwo(tt.input); got != tt.expected { + t.Errorf("nextPowerOfTwo(%d) = %d, want %d", tt.input, got, tt.expected) + } + }) + } +} + +// TestOverwrite verifies that old items are discarded if we exceed capacity. +func TestOverwrite(t *testing.T) { + rb := NewMP1COverwritingRingBuffer[int](4) + + // Fill the buffer of size 4 + rb.Enqueue(1) + rb.Enqueue(2) + rb.Enqueue(3) + rb.Enqueue(4) + + // Next enqueue should force overwrite + rb.Enqueue(5) + + // Now the oldest item (1) should have been discarded + // Let's dump up to 10 items + out := rb.DumpN(10) + // Expect items [2,3,4,5] + if len(out) != 4 { + t.Fatalf("Expected 4 items, got %d", len(out)) + } + want := []int{2, 3, 4, 5} + for i, v := range out { + if want[i] != v { + t.Errorf("Expected %d at index %d, got %d", want[i], i, v) + } + } +} + +// TestHighVolume simulates multiple producers generating ~100k entries/s total, +// while a single consumer drains them in batches (DumpN(100)) every ~100ms. +// +// Use `go test -race -v` to ensure data-race detection. 
+func TestHighVolume(t *testing.T) { + const ( + ringCapacity = 1 << 14 // 16384 slots + producerCount = 10 + totalPerProducer = 20000 // total logs to produce per producer + batchSize = 100 + consumerPeriod = 100 * time.Millisecond + ) + + rb := NewMP1COverwritingRingBuffer[int](ringCapacity) + + var wg sync.WaitGroup + start := time.Now() + + // Start multiple producers + wg.Add(producerCount) + for p := 0; p < producerCount; p++ { + go func(id int) { + defer wg.Done() + for i := 0; i < totalPerProducer; i++ { + // Here we do a simple Enqueue of an integer + msg := id*1_000_000 + i // encode producer id + index + rb.Enqueue(msg) + + // ~simulate 100k/s total + // If each producer tries to produce ~10k/s for 10 producers => 100k/s + // That is 1 item every 100 microseconds. We'll do a Sleep(50us..200us) + time.Sleep(time.Microsecond * time.Duration(rand.Intn(150)+50)) + } + }(p) + } + + // Single consumer + stopConsumer := make(chan struct{}) + consumeCount := int64(0) + + // We collect logs, but since it's overwriting, we won't see them all. + // We'll just track how many we've read so far to confirm progress. 
+ go func() { + ticker := time.NewTicker(consumerPeriod) + defer ticker.Stop() + + for { + select { + case <-stopConsumer: + return + case <-ticker.C: + out := rb.DumpN(batchSize) + if len(out) > 0 { + atomic.AddInt64(&consumeCount, int64(len(out))) + // For demonstration, let's just print the count + t.Logf("Consumer got %d logs in this batch", len(out)) + } + } + } + }() + + // Wait for all producers to finish + wg.Wait() + // Signal consumer to stop + close(stopConsumer) + + took := time.Since(start) + totalProduced := int64(producerCount * totalPerProducer) + t.Logf("Producers done: produced %d messages in %v", totalProduced, took) + + finalConsumed := atomic.LoadInt64(&consumeCount) + t.Logf("Total logs consumed (best effort) = %d", finalConsumed) + + // Because overwriting is allowed, finalConsumed <= totalProduced, possibly much less + if finalConsumed <= 0 { + t.Errorf("Consumer apparently got 0 logs, that shouldn't happen!") + } +} + +// BenchmarkSampling measures how effectively the ring buffer "samples" logs +// under high production rates (~100k/s) for various parameter sets. +// +// Run with: go test -bench=BenchmarkSampling -benchtime=5s -cpu=1,2,4 -v +func BenchmarkSampling(b *testing.B) { + // We’ll try different ring sizes, batch sizes, and consumer intervals. + ringSizes := []uint64{4096, 16384, 65536} + batchSizes := []uint64{50, 100, 500} + consumerIntervals := []time.Duration{50 * time.Millisecond, 100 * time.Millisecond} + + // For the benchmark, we won't iterate with b.N in the usual sense. + // Instead, each sub-benchmark will run for a fixed time (e.g. 2s or 5s). + // We'll measure how many logs are produced vs. consumed in that time. 
+ runDuration := 2 * time.Second + + for _, ringSize := range ringSizes { + for _, batchSize := range batchSizes { + for _, interval := range consumerIntervals { + name := fmt.Sprintf("Ring%d_Batch%d_Interval%v", ringSize, batchSize, interval) + b.Run(name, func(b *testing.B) { + // We only want to measure the scenario once per sub-benchmark, + // not repeated b.N times. So we do: + b.StopTimer() + // Setup + rb := NewMP1COverwritingRingBuffer[int](ringSize) + + // We'll use some concurrency to approximate ~100k logs/s total. + // Let’s define how many producers, each producing at ~some rate. + producerCount := 8 + // We'll measure how many logs were actually produced: + var producedCount int64 + // We'll measure how many logs the consumer read: + var consumedCount int64 + + // Start producers + var wg sync.WaitGroup + wg.Add(producerCount) + // We'll run producers for "runDuration" + producerStop := make(chan struct{}) + + for p := 0; p < producerCount; p++ { + go func(id int) { + defer wg.Done() + r := rand.New(rand.NewSource(time.Now().UnixNano())) + for { + select { + case <-producerStop: + return + default: + // Enqueue an integer (id+random) + val := id*1_000_000 + r.Intn(100000) + rb.Enqueue(val) + atomic.AddInt64(&producedCount, 1) + // Sleep ~10 microseconds => ~100k/s across 8 producers? 
+ // (8 producers * (1 / 10us) = 80k/s, tune as needed) + time.Sleep(10 * time.Microsecond) + } + } + }(p) + } + + // Start consumer (single) + consumerStop := make(chan struct{}) + go func() { + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-consumerStop: + return + case <-ticker.C: + out := rb.DumpN(batchSize) + atomic.AddInt64(&consumedCount, int64(len(out))) + } + } + }() + + // Now we actually "run" the scenario + b.StartTimer() + time.Sleep(runDuration) + b.StopTimer() + + // Signal producers and consumer to stop + close(producerStop) + wg.Wait() + close(consumerStop) + + // final measurement + pCount := atomic.LoadInt64(&producedCount) + cCount := atomic.LoadInt64(&consumedCount) + + // In many benchmarks, we might do b.SetBytes(...) or b.ReportMetric(...). + // For a "sampling rate," let's do: + samplingRate := float64(cCount) / float64(pCount+1) // +1 avoid /0 + logsPerSecondConsumed := float64(cCount) / runDuration.Seconds() + + // Print or record the results + b.ReportAllocs() // show memory allocations + b.ReportMetric(float64(pCount)/runDuration.Seconds(), "produced_ops/s") + b.ReportMetric(logsPerSecondConsumed, "consumed_ops/s") + b.ReportMetric(samplingRate, "sampling_ratio") + }) + } + } + } +} diff --git a/internal/pkg/log/ringbuffer/writer.go b/internal/pkg/log/ringbuffer/writer.go new file mode 100644 index 00000000..c76033f7 --- /dev/null +++ b/internal/pkg/log/ringbuffer/writer.go @@ -0,0 +1,55 @@ +package ringbuffer + +import "bytes" + +// Writer implements io.Writer and writes complete log lines to a ring buffer. +// It accumulates partial writes until a newline is seen. +type Writer struct { + rb *MP1COverwritingRingBuffer[string] + buf []byte +} + +// NewWriter returns a new Writer backed by the given ring buffer. +func NewWriter(rb *MP1COverwritingRingBuffer[string]) *Writer { + return &Writer{ + rb: rb, + } +} + +// Write implements io.Writer. 
+// It scans the input for newline characters, enqueuing each complete log line into the ring buffer. +// Any bytes after the last newline remain buffered until the next Write. +func (w *Writer) Write(p []byte) (n int, err error) { + n = len(p) + // Append new bytes to our internal buffer. + w.buf = append(w.buf, p...) + + // Process any complete lines. + for { + // Find the index of the newline character. + idx := bytes.IndexByte(w.buf, '\n') + if idx < 0 { + // No newline found: leave any incomplete log line in the buffer. + break + } + // Extract a complete log line (not including the newline). + line := string(w.buf[:idx]) + // Enqueue the complete log line into the ring buffer. + w.rb.Enqueue(line) + // Remove the processed log line (and its newline) from the buffer. + w.buf = w.buf[idx+1:] + } + return n, nil +} + +// Flush forces writing any buffered data (even if incomplete). +// This is not part of io.Writer, but can be useful in some logging setups. +func (w *Writer) Flush() error { + if len(w.buf) > 0 { + // Optionally, decide how to handle an incomplete log line. + // Here we enqueue it as is. + w.rb.Enqueue(string(w.buf)) + w.buf = nil + } + return nil +} diff --git a/internal/pkg/log/ringbuffer/writer_test.go b/internal/pkg/log/ringbuffer/writer_test.go new file mode 100644 index 00000000..86b91f63 --- /dev/null +++ b/internal/pkg/log/ringbuffer/writer_test.go @@ -0,0 +1,179 @@ +package ringbuffer + +import ( + "log/slog" + "strings" + "testing" +) + +// TestSlogHandlerSingleLine tests that a simple log entry written via slog +// produces a complete log line in the ring buffer. +func TestSlogHandlerSingleLine(t *testing.T) { + rb := NewMP1COverwritingRingBuffer[string](10) + writer := NewWriter(rb) + handler := slog.NewTextHandler(writer, &slog.HandlerOptions{ + // For testing, you might disable timestamp, source, etc. 
+ ReplaceAttr: func(groups []string, a slog.Attr) slog.Attr { + return a + }, + }) + logger := slog.New(handler) + + // Log a simple message. + logger.Info("test message", "key", "value") + + // In a typical use-case, the handler writes a complete log line + // (ending with a newline). But if for some reason the log output is + // buffered, we can call Flush to force any incomplete line. + writer.Flush() + + // Dump from the ring buffer. + entries := rb.DumpN(10) + if len(entries) != 1 { + t.Fatalf("expected 1 log line, got %d", len(entries)) + } + // The formatted log line should contain the message. + if !strings.Contains(entries[0], "test message") { + t.Errorf("expected log line to contain 'test message', got: %s", entries[0]) + } +} + +// TestMultipleLinesOneWrite tests that a single Write call containing multiple +// newlines produces multiple entries. +func TestMultipleLinesOneWrite(t *testing.T) { + rb := NewMP1COverwritingRingBuffer[string](10) + writer := NewWriter(rb) + + // Write a string that contains three complete lines. + input := "first line\nsecond line\nthird line\n" + n, err := writer.Write([]byte(input)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if n != len(input) { + t.Fatalf("expected to write %d bytes, wrote %d", len(input), n) + } + writer.Flush() // not strictly necessary here since all lines end with \n + + entries := rb.DumpN(10) + if len(entries) != 3 { + t.Fatalf("expected 3 log lines, got %d", len(entries)) + } + + expected := []string{"first line", "second line", "third line"} + for i, exp := range expected { + if entries[i] != exp { + t.Errorf("line %d: expected %q, got %q", i, exp, entries[i]) + } + } +} + +// TestIncompleteLine tests that incomplete lines remain buffered until a newline +// is received. +func TestIncompleteLine(t *testing.T) { + rb := NewMP1COverwritingRingBuffer[string](10) + writer := NewWriter(rb) + + // Write an incomplete line (no newline yet). 
+ writer.Write([]byte("incomplete")) + // Dumping now should return nil because no complete line is present. + if entries := rb.DumpN(10); entries != nil { + t.Fatalf("expected no complete log line, got: %v", entries) + } + + // Write the rest of the line. + writer.Write([]byte(" line\n")) + // Now the buffered content should yield a complete line. + entries := rb.DumpN(10) + if len(entries) != 1 { + t.Fatalf("expected 1 log line, got %d", len(entries)) + } + if entries[0] != "incomplete line" { + t.Errorf("expected log line %q, got %q", "incomplete line", entries[0]) + } +} + +// TestFlushIncomplete tests that calling Flush forces any incomplete log line +// into the ring buffer. +func TestFlushIncomplete(t *testing.T) { + rb := NewMP1COverwritingRingBuffer[string](10) + writer := NewWriter(rb) + + // Write an incomplete log line. + writer.Write([]byte("partial line")) + // Without a newline, DumpN should yield nil. + if entries := rb.DumpN(10); entries != nil { + t.Fatalf("expected no complete log line, got: %v", entries) + } + + // Flush the writer so that the incomplete line is enqueued. + writer.Flush() + entries := rb.DumpN(10) + if len(entries) != 1 { + t.Fatalf("expected 1 log line after flush, got %d", len(entries)) + } + if entries[0] != "partial line" { + t.Errorf("expected log line %q, got %q", "partial line", entries[0]) + } +} + +// TestMultipleWritesForSingleLine tests that a log line built over several Write +// calls is enqueued as one complete line. +func TestMultipleWritesForSingleLine(t *testing.T) { + rb := NewMP1COverwritingRingBuffer[string](10) + writer := NewWriter(rb) + + // Write parts of a line. + writer.Write([]byte("part1 ")) + writer.Write([]byte("part2")) + // At this point, no newline has been encountered. + if entries := rb.DumpN(10); entries != nil { + t.Errorf("expected no complete log line, got: %v", entries) + } + + // Write the newline to complete the log line. 
+ writer.Write([]byte("\n")) + // Flush is optional here since a newline was written. + writer.Flush() + + entries := rb.DumpN(10) + if len(entries) != 1 { + t.Fatalf("expected 1 complete log line, got %d", len(entries)) + } + if entries[0] != "part1 part2" { + t.Errorf("expected log line %q, got %q", "part1 part2", entries[0]) + } +} + +// TestEdgeCases tests a couple of edge cases such as empty writes and lines starting with a newline. +func TestEdgeCases(t *testing.T) { + // Test empty write. + rb := NewMP1COverwritingRingBuffer[string](10) + writer := NewWriter(rb) + + n, err := writer.Write([]byte("")) + if err != nil { + t.Fatalf("unexpected error on empty write: %v", err) + } + if n != 0 { + t.Errorf("expected 0 bytes written on empty write, got %d", n) + } + if entries := rb.DumpN(10); entries != nil { + t.Errorf("expected no log entries after empty write, got: %v", entries) + } + + // Test a string that begins with a newline. + writer.Write([]byte("\nfirst line\n")) + writer.Flush() + entries := rb.DumpN(10) + if len(entries) != 2 { + t.Fatalf("expected 2 log lines, got %d", len(entries)) + } + // The first line should be empty (i.e. just "\n") and the second should be "first line\n". 
+ if entries[0] != "" { + t.Errorf("expected first log line to be empty, got %q", entries[0]) + } + if entries[1] != "first line" { + t.Errorf("expected second log line to be \"first line\", got %q", entries[1]) + } +} diff --git a/internal/pkg/log/rotated_file.go b/internal/pkg/log/rotated_file.go new file mode 100644 index 00000000..9941e9a8 --- /dev/null +++ b/internal/pkg/log/rotated_file.go @@ -0,0 +1,92 @@ +package log + +import ( + "fmt" + "log" + "log/slog" + "os" + "sync" + "time" +) + +type rotatedFile struct { + level slog.Level + config *logfileConfig + file *os.File + mu sync.Mutex + ticker *time.Ticker + closeChan chan struct{} +} + +func newRotatedFile(config *logfileConfig) *rotatedFile { + rfile := &rotatedFile{ + config: config, + closeChan: make(chan struct{}), + } + + rfile.rotateFile() + if rfile.config.Rotate && rfile.config.RotatePeriod > 0 { + rfile.ticker = time.NewTicker(rfile.config.RotatePeriod) + wg.Add(1) + go rfile.rotationWorker() + } + + return rfile +} + +func (d *rotatedFile) Write(p []byte) (int, error) { + d.mu.Lock() + defer d.mu.Unlock() + if d.file == nil { + return 0, os.ErrClosed + } + return d.file.Write(p) +} + +func (d *rotatedFile) Close() { + if d.ticker != nil { + d.ticker.Stop() + } + close(d.closeChan) + d.mu.Lock() + if d.file != nil { + fmt.Fprintln(d.file, "Log file closed") + d.file.Close() + } + d.mu.Unlock() +} + +func (d *rotatedFile) rotateFile() { + d.mu.Lock() + defer d.mu.Unlock() + if d.file != nil { + d.file.Close() + } + + // Check if the directory exists, if not create it + if _, err := os.Stat(d.config.Dir); os.IsNotExist(err) { + err = os.MkdirAll(d.config.Dir, 0755) + if err != nil { + log.Fatalf("Failed to create log directory: %v", err) + } + } + + filename := fmt.Sprintf("%s/%s-%s.log", d.config.Dir, d.config.Prefix, time.Now().Format("2006.01.02T15-04")) + file, err := os.OpenFile(filename, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644) + if err != nil { + log.Fatalf("Failed to open log file: %v", 
err) + } + d.file = file +} + +func (d *rotatedFile) rotationWorker() { + defer wg.Done() + for { + select { + case <-d.ticker.C: + d.rotateFile() + case <-d.closeChan: + return + } + } +} diff --git a/internal/pkg/postprocessor/assets.go b/internal/pkg/postprocessor/assets.go new file mode 100644 index 00000000..6b28ad66 --- /dev/null +++ b/internal/pkg/postprocessor/assets.go @@ -0,0 +1,90 @@ +package postprocessor + +import ( + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/Zeno/internal/pkg/postprocessor/extractor" + "github.com/internetarchive/Zeno/internal/pkg/postprocessor/sitespecific/ina" + "github.com/internetarchive/Zeno/internal/pkg/postprocessor/sitespecific/truthsocial" + "github.com/internetarchive/Zeno/pkg/models" +) + +// extractAssets extracts assets from the item's body and returns them. +// It also potentially returns outlinks if the body contains URLs that are not assets. +func extractAssets(item *models.Item) (assets, outlinks []*models.URL, err error) { + var ( + contentType = item.GetURL().GetResponse().Header.Get("Content-Type") + logger = log.NewFieldedLogger(&log.Fields{ + "component": "postprocessor.extractAssets", + }) + ) + + // Extract assets from the body using the appropriate extractor + switch { + // Order is important, we want to check for more specific things first, + // as they may trigger more general extractors (e.g. HTML) + case ina.IsAPIURL(item.GetURL()): + INAAssets, err := ina.ExtractMedias(item.GetURL()) + if err != nil { + logger.Error("unable to extract medias from INA", "err", err.Error(), "item", item.GetShortID()) + return assets, outlinks, err + } + + HTMLAssets, err := extractor.HTMLAssets(item) + if err != nil { + logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID()) + return assets, outlinks, err + } + + assets = append(INAAssets, HTMLAssets...) 
+ case truthsocial.NeedExtraction(item.GetURL()): + assets, outlinks, err = truthsocial.ExtractAssets(item) + if err != nil { + logger.Error("unable to extract assets from TruthSocial", "err", err.Error(), "item", item.GetShortID()) + return assets, outlinks, err + } + case extractor.IsM3U8(item.GetURL()): + assets, err = extractor.M3U8(item.GetURL()) + if err != nil { + logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID()) + return assets, outlinks, err + } + case extractor.IsJSON(item.GetURL()): + assets, outlinks, err = extractor.JSON(item.GetURL()) + if err != nil { + logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID()) + return assets, outlinks, err + } + case extractor.IsXML(item.GetURL()): + assets, outlinks, err = extractor.XML(item.GetURL()) + if err != nil { + logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID()) + return assets, outlinks, err + } + case extractor.IsHTML(item.GetURL()): + assets, err = extractor.HTMLAssets(item) + if err != nil { + logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID()) + return assets, outlinks, err + } + default: + logger.Debug("no extractor used for page", "content-type", contentType, "item", item.GetShortID()) + return assets, outlinks, nil + } + + // For assets, set the hops level to the item's level + for _, asset := range assets { + asset.SetHops(item.GetURL().GetHops()) + } + + // For outlinks, set the hops level to the item's level + 1 + for _, outlink := range outlinks { + outlink.SetHops(item.GetURL().GetHops() + 1) + } + + return assets, outlinks, nil +} + +func shouldExtractAssets(item *models.Item) bool { + return !config.Get().DisableAssetsCapture && item.GetURL().GetBody() != nil +} diff --git a/internal/pkg/postprocessor/domainscrawl/domainscrawl.go b/internal/pkg/postprocessor/domainscrawl/domainscrawl.go new file mode 100644 index 00000000..24b7b641 --- /dev/null +++ 
b/internal/pkg/postprocessor/domainscrawl/domainscrawl.go @@ -0,0 +1,145 @@ +// Package domainscrawl is a postprocessing component that parses domains from a given input and stores them for later matching. +// It can store naive domains, full URLs, and regex patterns. It can then check if a given URL matches any of the stored patterns. +package domainscrawl + +import ( + "net/url" + "regexp" + "strings" + "sync" + + "github.com/ImVexed/fasturl" +) + +type matchEngine struct { + sync.RWMutex + enabled bool + regexes []*regexp.Regexp + domains []string + urls []url.URL +} + +var ( + globalMatcher = &matchEngine{ + enabled: false, + regexes: make([]*regexp.Regexp, 0), + domains: make([]string, 0), + urls: make([]url.URL, 0), + } +) + +// Reset the matcher to its initial state +func Reset() { + globalMatcher.Lock() + defer globalMatcher.Unlock() + + globalMatcher.enabled = false + globalMatcher.regexes = make([]*regexp.Regexp, 0) + globalMatcher.domains = make([]string, 0) + globalMatcher.urls = make([]url.URL, 0) +} + +// Enabled returns true if the domainscrawl matcher is enabled +func Enabled() bool { + globalMatcher.RLock() + defer globalMatcher.RUnlock() + + return globalMatcher.enabled +} + +// AddElements takes a slice of strings, heuristically determines their type, and stores them +func AddElements(elements []string) error { + globalMatcher.Lock() + defer globalMatcher.Unlock() + + if !globalMatcher.enabled { + globalMatcher.enabled = true + } + + for _, element := range elements { + // Try to parse as a URL first + parsedURL, err := url.Parse(element) + if err == nil && parsedURL.Scheme != "" && parsedURL.Host != "" { + // If it has a scheme and host, it's a full URL + globalMatcher.urls = append(globalMatcher.urls, *parsedURL) + continue + } + + // Check if it's a naive domain (e.g., "example.com") + if isNaiveDomain(element) { + globalMatcher.domains = append(globalMatcher.domains, element) + continue + } + + // Otherwise, assume it's a regex + re, err := 
regexp.Compile(element) + if err != nil { + return err + } + globalMatcher.regexes = append(globalMatcher.regexes, re) + } + return nil +} + +// Match checks if a given URL matches any of the stored patterns +func Match(rawURL string) bool { + u, err := fasturl.ParseURL(rawURL) + if err != nil { + return false + } + + globalMatcher.RLock() + defer globalMatcher.RUnlock() + + // Check against naive domains + for _, domain := range globalMatcher.domains { + if isSubdomainOrExactMatch(u.Host, domain) { + return true + } + } + + // Check against full URLs + for _, storedURL := range globalMatcher.urls { + if storedURL.String() == rawURL { + return true + } + // If the stored URL has no query, path, or fragment, we greedily match (sub)domain + if storedURL.RawQuery == "" && storedURL.Path == "" && storedURL.Fragment == "" && isSubdomainOrExactMatch(u.Host, storedURL.Host) { + return true + } + } + + // Check against regex patterns + for _, re := range globalMatcher.regexes { + if re.MatchString(rawURL) { + return true + } + } + + return false +} + +// Check if a string is a naive domain (e.g., "example.com") +func isNaiveDomain(s string) bool { + // A naive domain should not contain a scheme, path, or query + if strings.Contains(s, "://") || strings.Contains(s, "/") || strings.Contains(s, "?") || strings.Contains(s, "#") { + return false + } + // Check if it has a dot and no spaces + return strings.Contains(s, ".") && !strings.Contains(s, " ") +} + +// isSubdomainOrExactMatch checks if the given host is a subdomain or an exact match of the domain +func isSubdomainOrExactMatch(host, domain string) bool { + // Exact match + if host == domain { + return true + } + + // Subdomain match (e.g., "sub.example.com" matches "example.com") + if strings.HasSuffix(host, "."+domain) { + return true + } + + return false +} diff --git a/internal/pkg/postprocessor/domainscrawl/domainscrawl_test.go b/internal/pkg/postprocessor/domainscrawl/domainscrawl_test.go new file mode 100644 index 
00000000..1993862b --- /dev/null +++ b/internal/pkg/postprocessor/domainscrawl/domainscrawl_test.go @@ -0,0 +1,273 @@ +package domainscrawl + +import ( + "testing" +) + +// Test isNaiveDomain function +func TestIsNaiveDomain(t *testing.T) { + tests := []struct { + input string + expected bool + }{ + {"example.com", true}, + {"sub.example.com", true}, + {"example.com/path", false}, + {"https://example.com", false}, + {"example.com?query=1", false}, + {"example.com#fragment", false}, + {"https://example.org/path?query=1", false}, + {"example", false}, // No dot + {"example com", false}, // Contains space + } + + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + result := isNaiveDomain(tt.input) + if result != tt.expected { + t.Errorf("isNaiveDomain(%q) = %v, expected %v", tt.input, result, tt.expected) + } + }) + } +} + +// Test isSubdomainOrExactMatch function +func TestIsSubdomainOrExactMatch(t *testing.T) { + tests := []struct { + host string + domain string + expected bool + }{ + {"example.com", "example.com", true}, // Exact match + {"sub.example.com", "example.com", true}, // Subdomain match + {"example.com", "sub.example.com", false}, // Not a subdomain + {"example.org", "example.com", false}, // Different domain + } + + for _, tt := range tests { + t.Run(tt.host+"_"+tt.domain, func(t *testing.T) { + result := isSubdomainOrExactMatch(tt.host, tt.domain) + if result != tt.expected { + t.Errorf("isSubdomainOrExactMatch(%q, %q) = %v, expected %v", tt.host, tt.domain, result, tt.expected) + } + }) + } +} + +// Test Enabled function +func TestEnabled(t *testing.T) { + Reset() + if Enabled() { + t.Error("Enabled() = true, expected false") + } + + err := AddElements([]string{"example.com"}) + if err != nil { + t.Fatalf("Failed to add elements: %v", err) + } + + if !Enabled() { + t.Error("Enabled() = false, expected true") + } +} + +// Test AddElements function +func TestAddElements(t *testing.T) { + tests := []struct { + name string + elements 
[]string + expectErr bool + expectNaiveDomains []string + expectURLs []string + expectRegexes []string + }{ + { + name: "Valid naive domain", + elements: []string{"example.com"}, + expectErr: false, + expectNaiveDomains: []string{"example.com"}, + expectURLs: nil, + expectRegexes: nil, + }, + { + name: "Valid full URL", + elements: []string{"https://example.org/path?query=1"}, + expectErr: false, + expectNaiveDomains: nil, + expectURLs: []string{"https://example.org/path?query=1"}, + expectRegexes: nil, + }, + { + name: "Valid regex", + elements: []string{`^https?://(www\.)?example\.net/.*`}, + expectErr: false, + expectURLs: nil, + expectRegexes: []string{`^https?://(www\.)?example\.net/.*`}, + expectNaiveDomains: nil, + }, + { + name: "Invalid regex", + elements: []string{`[invalid`}, + expectErr: true, + expectURLs: nil, + expectRegexes: nil, + expectNaiveDomains: nil, + }, + { + name: "Mixed valid and invalid", + elements: []string{"example.com", `[invalid`}, + expectErr: true, + expectURLs: nil, + expectRegexes: nil, + expectNaiveDomains: []string{"example.com"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + Reset() + err := AddElements(tt.elements) + if (err != nil) != tt.expectErr { + t.Errorf("AddElements() error = %v, expectErr = %v", err, tt.expectErr) + } + + // Check naive domains + if len(tt.expectNaiveDomains) != len(globalMatcher.domains) { + t.Errorf("len(globalMatcher.domains) = %d, expected %d", len(globalMatcher.domains), len(tt.expectNaiveDomains)) + } else { + for i, domain := range tt.expectNaiveDomains { + if globalMatcher.domains[i] != domain { + t.Errorf("globalMatcher.domains[%d] = %q, expected %q", i, globalMatcher.domains[i], domain) + } + } + } + + // Check URLs + if len(tt.expectURLs) != len(globalMatcher.urls) { + t.Errorf("len(globalMatcher.urls) = %d, expected %d", len(globalMatcher.urls), len(tt.expectURLs)) + } else { + for i, url := range tt.expectURLs { + if globalMatcher.urls[i].String() != url 
{ + t.Errorf("globalMatcher.urls[%d] = %q, expected %q", i, globalMatcher.urls[i].String(), url) + } + } + } + + // Check regexes + if len(tt.expectRegexes) != len(globalMatcher.regexes) { + t.Errorf("len(globalMatcher.regexes) = %d, expected %d", len(globalMatcher.regexes), len(tt.expectRegexes)) + } else { + for i, re := range tt.expectRegexes { + if globalMatcher.regexes[i].String() != re { + t.Errorf("globalMatcher.regexes[%d] = %q, expected %q", i, globalMatcher.regexes[i].String(), re) + } + } + } + }) + } +} + +// Test Match function +func TestMatch(t *testing.T) { + tests := []struct { + name string + rawURL string + elements []string + expected bool + }{ + { + name: "Exact match for naive domain", + rawURL: "https://example.com", + elements: []string{"example.com"}, + expected: true, + }, + { + name: "Subdomain match for naive domain", + rawURL: "https://sub.example.com", + elements: []string{"example.com"}, + expected: true, + }, + { + name: "Exact match for full URL", + rawURL: "https://example.org/path?query=1", + elements: []string{"https://example.org/path?query=1"}, + expected: true, + }, + { + name: "No match for full URL", + rawURL: "https://example.org/path?query=completely-different", + elements: []string{"https://example.org/path?query=1"}, + expected: false, + }, + { + name: "Greedy match for naive domain", + rawURL: "https://example.org/path?query=1", + elements: []string{"example.org"}, + expected: true, + }, + { + name: "Greedy match for full URL", + rawURL: "https://example.org/path?query=1", + elements: []string{"https://example.org"}, + expected: true, + }, + { + name: "Regex match", + rawURL: "https://example.net/", + elements: []string{`^https?://(www\.)?example\.net/.*`}, + expected: true, + }, + { + name: "Regex match with different scheme", + rawURL: "http://www.example.net/resource", + elements: []string{`^https?://(www\.)?example\.net/.*`}, + expected: true, + }, + { + name: "No match for different domain", + rawURL: 
"https://different.com", + elements: []string{"example.com"}, + expected: false, + }, + { + name: "No match for different full URL", + rawURL: "https://example.com/path", + elements: []string{"https://another-example.com"}, + expected: false, + }, + { + name: "No match for different regex", + rawURL: "https://example.net/", + elements: []string{`^https?://(www\.)?example\.com/.*`}, + expected: false, + }, + { + name: "No match for different precise regex", + rawURL: "https://example.net/?query=1", + elements: []string{`^https?://(www\.)?example\.net/only-one-path$`}, + expected: false, + }, + { + name: "Invalid URL with valid naive domain", + rawURL: "%am-i-really-an-URL?", + elements: []string{"example.com"}, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + Reset() + + err := AddElements(tt.elements) + if err != nil { + t.Fatalf("Failed to add elements: %v", err) + } + + result := Match(tt.rawURL) + if result != tt.expected { + t.Errorf("Match(%q) = %v, expected %v", tt.rawURL, result, tt.expected) + } + }) + } +} diff --git a/internal/pkg/postprocessor/error.go b/internal/pkg/postprocessor/error.go new file mode 100644 index 00000000..b9ce6de7 --- /dev/null +++ b/internal/pkg/postprocessor/error.go @@ -0,0 +1,8 @@ +package postprocessor + +import "errors" + +var ( + // ErrPostprocessorAlreadyInitialized is the error returned when the postprocessor is already initialized + ErrPostprocessorAlreadyInitialized = errors.New("postprocessor already initialized") +) diff --git a/internal/pkg/postprocessor/extractor/base.go b/internal/pkg/postprocessor/extractor/base.go new file mode 100644 index 00000000..910dafe5 --- /dev/null +++ b/internal/pkg/postprocessor/extractor/base.go @@ -0,0 +1,13 @@ +package extractor + +import ( + "github.com/PuerkitoBio/goquery" + "github.com/internetarchive/Zeno/pkg/models" +) + +func extractBaseTag(item *models.Item, doc *goquery.Document) { + base, exists := 
doc.Find("base").First().Attr("href") + if exists { + item.SetBase(base) + } +} diff --git a/internal/pkg/postprocessor/extractor/base_test.go b/internal/pkg/postprocessor/extractor/base_test.go new file mode 100644 index 00000000..38df1dc4 --- /dev/null +++ b/internal/pkg/postprocessor/extractor/base_test.go @@ -0,0 +1,37 @@ +package extractor + +import ( + "strings" + "testing" + + "github.com/internetarchive/Zeno/pkg/models" + "github.com/PuerkitoBio/goquery" +) + +func TestExtractBaseTag(t *testing.T) { + htmlString := ` + + + + Test Page + + +

First paragraph

+ + ` + doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlString)) + if err != nil { + t.Errorf("html doc loading failed %s", err) + } + + item := models.NewItem("test", &models.URL{ + Raw: "https://example.com/something/page.html", + }, "") + + extractBaseTag(item, doc) + + if item.GetBase() != "http://example.com/something/" { + t.Errorf("Cannot find html doc base.href") + } +} diff --git a/internal/pkg/postprocessor/extractor/html.go b/internal/pkg/postprocessor/extractor/html.go new file mode 100644 index 00000000..b781ad97 --- /dev/null +++ b/internal/pkg/postprocessor/extractor/html.go @@ -0,0 +1,358 @@ +package extractor + +import ( + "encoding/json" + "regexp" + "slices" + "strconv" + "strings" + + "github.com/PuerkitoBio/goquery" + "github.com/internetarchive/Zeno/internal/pkg/config" + "github.com/internetarchive/Zeno/internal/pkg/log" + "github.com/internetarchive/Zeno/internal/pkg/utils" + "github.com/internetarchive/Zeno/pkg/models" +) + +var ( + backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`) + urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`) +) + +func IsHTML(URL *models.URL) bool { + return isContentType(URL.GetResponse().Header.Get("Content-Type"), "html") || strings.Contains(URL.GetMIMEType().String(), "html") +} + +func HTMLOutlinks(item *models.Item) (outlinks []*models.URL, err error) { + defer item.GetURL().RewindBody() + + logger := log.NewFieldedLogger(&log.Fields{ + "component": "postprocessor.extractor.HTMLOutlinks", + }) + + var rawOutlinks []string + + // Retrieve (potentially creates it) the document from the body + document, err := item.GetURL().GetDocument() + if err != nil { + return nil, err + } + + // Extract the base tag if it exists + extractBaseTag(item, document) + + // Match tags with href, data-href, data-src, data-srcset, data-lazy-src, data-srcset, src, srcset + if !slices.Contains(config.Get().DisableHTMLTag, "a") { + document.Find("a").Each(func(index int, i *goquery.Selection) { + 
for _, node := range i.Nodes { + for _, attr := range node.Attr { + link := attr.Val + rawOutlinks = append(rawOutlinks, link) + } + } + }) + } + + for _, rawOutlink := range rawOutlinks { + resolvedURL, err := resolveURL(rawOutlink, item) + if err != nil { + logger.Debug("unable to resolve URL", "error", err, "url", item.GetURL().String(), "item", item.GetShortID()) + } else if resolvedURL != "" { + outlinks = append(outlinks, &models.URL{ + Raw: resolvedURL, + }) + continue + } + + outlinks = append(outlinks, &models.URL{ + Raw: rawOutlink, + }) + } + + return outlinks, nil +} + +func HTMLAssets(item *models.Item) (assets []*models.URL, err error) { + logger := log.NewFieldedLogger(&log.Fields{ + "component": "postprocessor.extractor.HTMLAssets", + }) + + var rawAssets []string + + // Retrieve (potentially creates it) the document from the body + document, err := item.GetURL().GetDocument() + if err != nil { + return nil, err + } + + // Extract the base tag if it exists + extractBaseTag(item, document) + + // Get assets from JSON payloads in data-item values + // Check all elements style attributes for background-image & also data-preview + document.Find("[data-item], [style], [data-preview]").Each(func(index int, i *goquery.Selection) { + dataItem, exists := i.Attr("data-item") + if exists { + URLsFromJSON, _, err := GetURLsFromJSON(json.NewDecoder(strings.NewReader(dataItem))) + if err != nil { + logger.Debug("unable to extract URLs from JSON in data-item attribute", "err", err, "url", item.GetURL().String(), "item", item.GetShortID()) + } else { + rawAssets = append(rawAssets, URLsFromJSON...) 
+ } + } + + style, exists := i.Attr("style") + if exists { + matches := backgroundImageRegex.FindAllStringSubmatch(style, -1) + + for match := range matches { + if len(matches[match]) > 0 { + matchFound := matches[match][1] + + // Don't extract CSS elements that aren't URLs + if strings.Contains(matchFound, "%") || + strings.HasPrefix(matchFound, "0.") || + strings.HasPrefix(matchFound, "--font") || + strings.HasPrefix(matchFound, "--size") || + strings.HasPrefix(matchFound, "--color") || + strings.HasPrefix(matchFound, "--shreddit") || + strings.HasPrefix(matchFound, "100vh") { + continue + } + + rawAssets = append(rawAssets, matchFound) + } + } + } + + dataPreview, exists := i.Attr("data-preview") + if exists { + if strings.HasPrefix(dataPreview, "http") { + rawAssets = append(rawAssets, dataPreview) + } + } + }) + + // Try to find assets in tags.. this is a bit funky + if !slices.Contains(config.Get().DisableHTMLTag, "a") { + var validAssetPath = []string{ + "static/", + "assets/", + "asset/", + "images/", + "image/", + "img/", + } + + var validAssetAttributes = []string{ + "href", + "data-href", + "data-src", + "data-srcset", + "data-lazy-src", + "data-srcset", + "src", + "srcset", + } + + document.Find("a").Each(func(index int, i *goquery.Selection) { + for _, attr := range validAssetAttributes { + link, exists := i.Attr(attr) + if exists { + if utils.StringContainsSliceElements(link, validAssetPath) { + rawAssets = append(rawAssets, link) + } + } + } + }) + } + + // Extract assets on the page (images, scripts, videos..) 
+ if !slices.Contains(config.Get().DisableHTMLTag, "img") { + document.Find("img").Each(func(index int, i *goquery.Selection) { + link, exists := i.Attr("src") + if exists { + rawAssets = append(rawAssets, link) + } + + link, exists = i.Attr("data-src") + if exists { + rawAssets = append(rawAssets, link) + } + + link, exists = i.Attr("data-lazy-src") + if exists { + rawAssets = append(rawAssets, link) + } + + link, exists = i.Attr("data-srcset") + if exists { + links := strings.Split(link, ",") + for _, link := range links { + rawAssets = append(rawAssets, strings.Split(strings.TrimSpace(link), " ")[0]) + } + } + + link, exists = i.Attr("srcset") + if exists { + links := strings.Split(link, ",") + for _, link := range links { + rawAssets = append(rawAssets, strings.Split(strings.TrimSpace(link), " ")[0]) + } + } + }) + } + + var targetElements = []string{} + if !slices.Contains(config.Get().DisableHTMLTag, "video") { + targetElements = append(targetElements, "video[src]") + } + if !slices.Contains(config.Get().DisableHTMLTag, "audio") { + targetElements = append(targetElements, "audio[src]") + } + if len(targetElements) > 0 { + document.Find(strings.Join(targetElements, ", ")).Each(func(index int, i *goquery.Selection) { + if link, exists := i.Attr("src"); exists { + rawAssets = append(rawAssets, link) + } + }) + } + + if !slices.Contains(config.Get().DisableHTMLTag, "style") { + document.Find("style").Each(func(index int, i *goquery.Selection) { + matches := urlRegex.FindAllStringSubmatch(i.Text(), -1) + for match := range matches { + matchReplacement := matches[match][1] + matchReplacement = strings.Replace(matchReplacement, "'", "", -1) + matchReplacement = strings.Replace(matchReplacement, "\"", "", -1) + + // If the URL already has http (or https), we don't need add anything to it. 
+ if !strings.Contains(matchReplacement, "http") { + matchReplacement = strings.Replace(matchReplacement, "//", "http://", -1) + } + + if strings.HasPrefix(matchReplacement, "#wp-") { + continue + } + + rawAssets = append(rawAssets, matchReplacement) + } + }) + } + + if !slices.Contains(config.Get().DisableHTMLTag, "script") { + document.Find("script").Each(func(index int, i *goquery.Selection) { + link, exists := i.Attr("src") + if exists { + rawAssets = append(rawAssets, link) + } + + scriptType, exists := i.Attr("type") + if exists { + if scriptType == "application/json" { + URLsFromJSON, _, err := GetURLsFromJSON(json.NewDecoder(strings.NewReader(i.Text()))) + if err != nil { + // TODO: maybe add back when https://github.com/internetarchive/Zeno/issues/147 is fixed + // c.Log.Debug("unable to extract URLs from JSON in script tag", "error", err, "url", URL) + } else { + rawAssets = append(rawAssets, URLsFromJSON...) + } + } + } + + // Apply regex on the script's HTML to extract potential assets + outerHTML, err := goquery.OuterHtml(i) + if err != nil { + logger.Debug("unable to extract outer HTML from script tag", "err", err, "url", item.GetURL().String(), "item", item.GetShortID()) + } else { + scriptLinks := utils.DedupeStrings(LinkRegexRelaxed.FindAllString(outerHTML, -1)) + for _, scriptLink := range scriptLinks { + if strings.HasPrefix(scriptLink, "http") { + // Escape URLs when unicode runes are present in the extracted URLs + scriptLink, err := strconv.Unquote(`"` + scriptLink + `"`) + if err != nil { + logger.Debug("unable to escape URL from JSON in script tag", "error", err, "url", item.GetURL().String(), "item", item.GetShortID()) + continue + } + rawAssets = append(rawAssets, scriptLink) + } + } + } + + // Some