Skip to content

Commit a6fc4e1

Browse files
authored
modifies base docker image to use browsertrix 1.4.2 (#182)
* modifies base image to newest browsertrix version
* modifies browsertrix cmd args based on recent experience
1 parent d4fff0b commit a6fc4e1

File tree

2 files changed

+9
-6
lines changed

2 files changed

+9
-6
lines changed

Dockerfile

+7-5
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
FROM webrecorder/browsertrix-crawler:1.0.4 AS base
1+
FROM webrecorder/browsertrix-crawler:1.4.2 AS base
22

33
ENV RUNNING_IN_DOCKER=1 \
44
LANG=C.UTF-8 \
@@ -29,21 +29,23 @@ ENV POETRY_NO_INTERACTION=1 \
2929
POETRY_VIRTUALENVS_CREATE=1
3030

3131

32-
RUN pip install --upgrade pip && \
33-
pip install "poetry>=2.0.0,<3.0.0"
32+
# Create a virtual environment for poetry and install it
33+
RUN python3 -m venv /poetry-venv && \
34+
/poetry-venv/bin/python -m pip install --upgrade pip && \
35+
/poetry-venv/bin/python -m pip install "poetry>=2.0.0,<3.0.0"
3436

3537
WORKDIR /app
3638

3739

3840
COPY pyproject.toml poetry.lock README.md ./
3941
# Copy dependency files and install dependencies (excluding the package itself)
40-
RUN poetry install --only main --no-root --no-cache
42+
RUN /poetry-venv/bin/poetry install --only main --no-root --no-cache
4143

4244

4345
# Copy code: This is needed for poetry to install the package itself,
4446
# but the environment should be cached from the previous step if toml and lock files haven't changed
4547
COPY ./src/ .
46-
RUN poetry install --only main --no-cache
48+
RUN /poetry-venv/bin/poetry install --only main --no-cache
4749

4850

4951
# Update PATH to include virtual environment binaries

src/auto_archiver/enrichers/wacz_enricher.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -84,7 +84,8 @@ def enrich(self, to_enrich: Metadata) -> bool:
8484
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
8585
"--behaviorTimeout", str(self.timeout),
8686
"--timeout", str(self.timeout),
87-
"--blockAds" # TODO: test
87+
"--diskUtilization", "99",
88+
# "--blockAds" # note: this has been known to cause issues on Cloudflare-protected sites
8889
]
8990

9091
if self.docker_in_docker:

0 commit comments

Comments
 (0)