Initial commit

KOLANICH · KOLANICH · commit 49076cb32070 · 2023-10-10T23:22:11.000+03:00
diff --git a/.ci/aptPackagesToInstall.txt b/.ci/aptPackagesToInstall.txt
@@ -0,0 +1 @@
+libblast
diff --git a/.ci/pythonPackagesToInstallFromGit.txt b/.ci/pythonPackagesToInstallFromGit.txt
@@ -0,0 +1 @@
+https://github.com/fileTestSuite/fileTestSuite.py
diff --git a/.editorconfig b/.editorconfig
@@ -0,0 +1,12 @@
+root = true
+
+[*]
+charset = utf-8
+indent_style = tab
+indent_size = 4
+insert_final_newline = true
+end_of_line = lf
+
+[*.{yml,yaml}]
+indent_style = space
+indent_size = 2
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,2 @@
+*.imploded filter=lfs diff=lfs merge=lfs -text
+*.decomp filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/.templateMarker b/.github/.templateMarker
@@ -0,0 +1 @@
+KOLANICH/python_project_boilerplate.py
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -0,0 +1,8 @@
+version: 2
+updates:
+  - package-ecosystem: "pip"
+    directory: "/"
+    schedule:
+      interval: "daily"
+    allow:
+      - dependency-type: "all"
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -0,0 +1,15 @@
+name: CI
+on:
+  push:
+    branches: [master]
+  pull_request:
+    branches: [master]
+
+jobs:
+  build:
+    runs-on: ubuntu-22.04
+    steps:
+      - name: typical python workflow
+        uses: KOLANICH-GHActions/typical-python-workflow@master
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,11 @@
+__pycache__
+*.pyc
+*.pyo
+/*.egg-info
+*.srctrlbm
+*.srctrldb
+build
+dist
+.eggs
+monkeytype.sqlite3
+/.ipynb_checkpoints
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -0,0 +1,51 @@
+image: registry.gitlab.com/kolanich-subgroups/docker-images/fixed_python:latest
+
+variables:
+  DOCKER_DRIVER: overlay2
+  SAST_ANALYZER_IMAGE_TAG: latest
+  SAST_DISABLE_DIND: "true"
+  SAST_CONFIDENCE_LEVEL: 5
+  CODECLIMATE_VERSION: latest
+
+include:
+  - template: SAST.gitlab-ci.yml
+  - template: Code-Quality.gitlab-ci.yml
+  - template: License-Management.gitlab-ci.yml
+
+build:
+  tags:
+    - shared
+    - linux
+  stage: build
+  variables:
+    GIT_DEPTH: "1"
+    PYTHONUSERBASE: ${CI_PROJECT_DIR}/python_user_packages
+
+  before_script:
+    - export PATH="$PATH:$PYTHONUSERBASE/bin" # don't move into `variables`
+    - apt-get update
+    # todo:
+    #- apt-get -y install 
+    #- pip3 install --upgrade 
+    #- python3 ./fix_python_modules_paths.py
+
+  script:
+    - python3 -m build -nw bdist_wheel
+    - mv ./dist/*.whl ./dist/pkblast-0.CI-py3-none-any.whl
+    - pip3 install --upgrade ./dist/*.whl
+    - coverage run --source=pkblast -m --branch pytest --junitxml=./rspec.xml ./tests/test.py
+    - coverage report -m
+    - coverage xml
+
+  coverage: /^TOTAL(?:\s+\d+){4}\s+(\d+%).+/
+
+  cache:
+    paths:
+      - $PYTHONUSERBASE
+
+  artifacts:
+    paths:
+      - dist
+    reports:
+      junit: ./rspec.xml
+      cobertura: ./coverage.xml
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,5 @@
+[submodule "tests/testDataset"]
+	path = tests/testDataset
+	url = https://github.com/implode-compression-impls/implode_test_files
+	branch = merged
+	shallow = true
diff --git a/Code_Of_Conduct.md b/Code_Of_Conduct.md
@@ -0,0 +1 @@
+No codes of conduct!
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,4 @@
+include UNLICENSE
+include *.md
+include tests
+include .editorconfig
diff --git a/ReadMe.md b/ReadMe.md
@@ -0,0 +1,43 @@
+pkblast.py [![Unlicensed work](https://raw.githubusercontent.com/unlicense/unlicense.org/master/static/favicon.png)](https://unlicense.org/)
+==========
+~~[wheel (GitLab)](https://gitlab.com/KOLANICH/pkblast.py/-/jobs/artifacts/master/raw/dist/pkblast-0.CI-py3-none-any.whl?job=build)~~
+~~[wheel (GHA via `nightly.link`)](https://nightly.link/implode-compression-impls/pkblast.py/workflows/CI/master/pkblast-0.CI-py3-none-any.whl)~~
+~~![GitLab Build Status](https://gitlab.com/KOLANICH/pkblast.py/badges/master/pipeline.svg)~~
+~~![GitLab Coverage](https://gitlab.com/KOLANICH/pkblast.py/badges/master/coverage.svg)~~
+~~[![GitHub Actions](https://github.com/implode-compression-impls/pkblast.py/workflows/CI/badge.svg)](https://github.com/implode-compression-impls/pkblast.py/actions/)~~
+[![Libraries.io Status](https://img.shields.io/librariesio/github/implode-compression-impls/pkblast.py.svg)](https://libraries.io/github/implode-compression-impls/pkblast.py)
+[![Code style: antiflash](https://img.shields.io/badge/code%20style-antiflash-FFF.svg)](https://codeberg.org/KOLANICH-tools/antiflash.py)
+
+These are free and Open-Source ctypes-based bindings to [`libblast`](https://github.com/madler/zlib/tree/master/contrib/blast) by [Mark @madler Adler](https://github.com/madler), which is a Free Open-Source implementation of a decompressor of PKWare Data Compression Library (DCL) compression format.
+
+For compression you need [`pkimplode.py`](https://codeberg.org/implode-compression-impls/pkimplode.py), a separate wrapper to a separate library by another author.
+
+For decompression you can alternatively use [`pwexplode`](https://github.com/Schallaven/pwexplode), a pure-python impl, but it is licensed under GPL-3.0-or-later.
+
+Benefits of CTypes-based impl:
+
+* Supports python versions other than CPython
+* No need to recompile python module after python version upgrade
+
+Drawbacks:
+* performance and overhead may be worse than in the case of a cext.
+
+Installation
+------------
+
+In order to make it work you need a package with `libblast` itself installed into your system using your distro package manager. If your distro doesn't provide one, you can build it yourself using CMake CPack from the sources [by the link](https://codeberg.org/implode-compression-impls/libblast). You will get 3 packages, one with the headers, another one with the shared library, and yet another one with the CLI tool. Only the one with the lib is mandatory.
+
+Usage
+-----
+
+The package contains multiple functions. They have names matching the regular expression `^decompress(Stream|Bytes(Whole|Chunked))To(Stream|Bytes)$`.
+
+The first subgroup describes the type of input argument, the second subgroup describes the type of output.
+* If input is `Bytes`, then you need to choose between
+    * `Whole`, which means that the lib gets a pointer to the whole array with compressed data. This is considered to be **the optimal input format**.
+    * `Chunked`, which means the data are in reality processed by `decompressStreamTo$3`; it was created mainly for convenience of testing.
+* Otherwise it is an object acting like a stream. In this case you can also provide `chunkSize`, because streams are processed in chunks. The larger the chunk, the fewer chunks there are in the stream, so there is less overhead on calls of callbacks, but more memory is needed to store the chunk.
+
+The second subgroup describes the type of the result.
+* The internal type of the result is always a `Stream`. This is considered to be **the optimal output format**. It is because we don't know the size of output ahead of time, so have to use streams.
+* `Bytes` are only for your convenience and just wrap the `decompress$1ToStream` with a context with `BytesIO`.
diff --git a/UNLICENSE b/UNLICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <https://unlicense.org/>
diff --git a/pkblast/BlastError.py b/pkblast/BlastError.py
@@ -0,0 +1,14 @@
+from enum import IntEnum
+
+
+class BlastError(IntEnum):
+	"""Codes returned by `blast`: zero is success, positive values are input/output problems, negative values are errors in the compressed data."""
+
+	success = 0
+
+	# If there is not enough input available or there is not enough output space, then a positive error is returned.
+	outputError = 1
+	inputExhausted = 2
+
+	# Errors in the source data
+	wrongLiteralFlag = -1  # literal flag not zero or one
+	wrongDictionary = -2  # dictionary size not in 4..6
+	distanceTooBig = -3  # distance is too far back
diff --git a/pkblast/__init__.py b/pkblast/__init__.py
@@ -0,0 +1,95 @@
+import typing
+from collections.abc import ByteString
+from io import BytesIO, IOBase
+from mmap import mmap
+from warnings import warn
+
+from .BlastError import BlastError
+from .ctypes import _decompressBytes, _decompressStream
+
+__all__ = ("decompressStreamToStream", "decompressStreamToBytes", "decompressBytesWholeToStream", "decompressBytesWholeToBytes", "decompressBytesChunkedToStream", "decompressBytesChunkedToBytes", "decompress", "DEFAULT_CHUNK_SIZE")
+
+DEFAULT_CHUNK_SIZE = 16384
+
+
+def decompressStreamToStream(inputStream: IOBase, outputStream: IOBase, chunkSize: int = DEFAULT_CHUNK_SIZE) -> (int, IOBase):
+	"""Used to do streaming decompression. The first arg is the stream to read from, the second arg is the stream to write to.
+	May be a memory map. `chunkSize` is the size of the buffer the input stream is read into.
+	Returns a tuple `(left, outputStream)`, where `left` is the count of unused input bytes reported by the library.
+	Raises `Exception` wrapping a `BlastError` if the library returns a non-zero code."""
+
+	errorCode, left = _decompressStream(inputStream, outputStream, chunkSize=chunkSize)
+
+	if errorCode:
+		raise Exception(BlastError(errorCode))
+
+	# `left` is a ctypes integer, unwrap it into a python `int`
+	return left.value, outputStream
+
+
+def decompressStreamToBytes(inputStream: IOBase, chunkSize: int = DEFAULT_CHUNK_SIZE) -> (int, ByteString):
+	"""Decompresses `inputStream` and returns a tuple `(left, decompressed)`. The output is collected in a temporary `BytesIO`, `chunkSize` is passed through to `decompressStreamToStream`."""
+	with BytesIO() as outputStream:
+		left, _ = decompressStreamToStream(inputStream, outputStream, chunkSize)
+		# the value must be taken before the `with` block closes the `BytesIO`
+		return left, outputStream.getvalue()
+
+
+def decompressBytesWholeToStream(compressed: ByteString, outputStream: IOBase) -> (int, IOBase):
+	"""Decompresses `compressed` into `outputStream`. Processes the whole data. You should use it instead of `decompressBytesChunkedToStream`, if it is possible.
+	Returns a tuple `(left, outputStream)`, where `left` is the count of unused input bytes reported by the library.
+	Raises `Exception` wrapping a `BlastError` if the library returns a non-zero code."""
+	errorCode, left = _decompressBytes(compressed, outputStream)
+
+	if errorCode:
+		raise Exception(BlastError(errorCode))
+
+	# `left` is a ctypes integer, unwrap it into a python `int`
+	return left.value, outputStream
+
+
+def decompressBytesWholeToBytes(compressed: ByteString) -> (int, ByteString):
+	"""Decompresses `compressed` and returns a tuple `(remaining, decompressed)`. Processes the whole data. You should use it instead of `decompressBytesChunkedToBytes`, if it is possible."""
+	with BytesIO() as outputStream:
+		left, _ = decompressBytesWholeToStream(compressed, outputStream)
+		# the value must be taken before the `with` block closes the `BytesIO`
+		return left, outputStream.getvalue()
+
+
+def decompressBytesChunkedToStream(compressed: ByteString, outputStream: IOBase, chunkSize: int = DEFAULT_CHUNK_SIZE) -> (int, IOBase):
+	"""Decompresses `compressed` into `outputStream`. Processes the `compressed` the same way `decompressStreamToStream` does. In fact it is just a wrapper around it and `BytesIO`. Has bigger overhead than `decompressBytesWholeToStream` for the data that is already in memory: first it copies the data into `BytesIO`, then it allocates space for chunks, then it copies data from there into chunks, each copying has overhead of calling from C into python.
+	Always emits a warning recommending `decompressBytesWholeToStream`. Returns the same tuple `(left, outputStream)` as `decompressStreamToStream`."""
+	_efficiencyDeprecationMessage(decompressBytesChunkedToStream, decompressBytesWholeToStream)
+	# a chunk buffer larger than the whole input would never be filled
+	chunkSize = min(chunkSize, len(compressed))
+	with BytesIO(compressed) as inputStream:
+		return decompressStreamToStream(inputStream, outputStream, chunkSize=chunkSize)
+
+
+def decompressBytesChunkedToBytes(compressed: ByteString, chunkSize: int = DEFAULT_CHUNK_SIZE) -> (int, ByteString):
+	"""Decompresses `compressed` into `bytes`. Processes the `compressed` the same way `decompressStreamToStream` does. In fact it is just a wrapper around `decompressBytesChunkedToStream` and `BytesIO`. Has bigger overhead than `decompressBytesWholeToBytes` for the data that is already in memory: first it copies the data into `BytesIO`, then it allocates space for chunks, then it copies data from there into chunks, each copying has overhead of calling from C into python.
+	Returns a tuple `(left, decompressed)`."""
+	_efficiencyDeprecationMessage(decompressBytesChunkedToBytes, decompressBytesWholeToBytes)
+	with BytesIO() as outputStream:
+		# NOTE(review): `decompressBytesChunkedToStream` emits its own warning too, so a single call here warns twice
+		left, _ = decompressBytesChunkedToStream(compressed, outputStream, chunkSize)
+		return left, outputStream.getvalue()
+
+
+def _efficiencyDeprecationMessage(calledFunc, func) -> None:
+	"""Emits a warning that `calledFunc` is inefficient and that `func` should be used instead. Only the `__name__` attributes of both functions are used."""
+	warn("It is inefficient to use `" + calledFunc.__name__ + "`. Use `" + func.__name__ + "` for this use case")
+
+
+# Lookup table used by `decompress`. The index is `isOutputBytes << 1 | isInputBytes`, so the order of the items matters:
+# 0 - stream to stream, 1 - bytes to stream, 2 - stream to bytes, 3 - bytes to bytes.
+_functionsUseCaseMapping = (
+	decompressStreamToStream,
+	decompressBytesWholeToStream,
+	decompressStreamToBytes,
+	decompressBytesWholeToBytes,
+)
+
+
+def decompress(compressed: typing.Union[ByteString, IOBase], outputStream: typing.Optional[IOBase] = None, chunkSize: int = DEFAULT_CHUNK_SIZE) -> (int, typing.Union[ByteString, IOBase]):
+	"""A convenience function. It is better to use the more specialized ones since they have less overhead. It decompresses `compressed` into `outputStream` and returns a tuple `(left, output)`.
+	`compressed` can be either a stream, or `bytes`-like stuff (a `ByteString` or an `mmap`).
+	If `outputStream` is None, then it returns `bytes`. If `outputStream` is a stream, it writes into it.
+	`chunkSize` is used only when `compressed` is a stream.
+	`left` returned is the count of bytes in the array/stream that weren't processed.
+	Always emits a warning naming the specialized function that was selected."""
+
+	isOutputBytes = outputStream is None
+	isInputBytes = isinstance(compressed, (ByteString, mmap))
+	# bit 1 - the output is bytes, bit 0 - the input is bytes; the result is an index into `_functionsUseCaseMapping`
+	selector = isOutputBytes << 1 | int(isInputBytes)
+	func = _functionsUseCaseMapping[selector]
+	# collect the positional arguments the selected function expects
+	argz = [compressed]
+	if not isOutputBytes:
+		argz.append(outputStream)
+	if not isInputBytes:
+		# only the functions taking a stream as input accept `chunkSize`
+		argz.append(chunkSize)
+	_efficiencyDeprecationMessage(decompress, func)
+	return func(*argz)
diff --git a/pkblast/ctypes.py b/pkblast/ctypes.py
@@ -0,0 +1,84 @@
+import ctypes
+import platform
+from collections.abc import ByteString
+from functools import partial
+from mmap import mmap
+
+__all__ = ("_decompressBytes", "_decompressStream")
+
+# Type of the input callback. It receives the opaque `how` pointer and a pointer to a buffer pointer,
+# stores the address of the available data there and returns the count of available bytes.
+#                                            how                   buf
+blast_in = ctypes.CFUNCTYPE(ctypes.c_uint32, ctypes.POINTER(None), ctypes.POINTER(ctypes.POINTER(ctypes.c_ubyte)))
+# Type of the output callback. It receives the opaque `how` pointer, the buffer with decompressed data and its length.
+# The callback used in this module returns non-zero when it failed to write all the data.
+#                                            how                   buf                             len
+blast_out = ctypes.CFUNCTYPE(ctypes.c_int32, ctypes.POINTER(None), ctypes.POINTER(ctypes.c_ubyte), ctypes.c_uint32)
+
+
+# presumably a placeholder: the name is rebound to the loaded library after `blast` and `_initLibrary` are defined
+lib = None
+
+
+def blast(infun: blast_in, inhow: ctypes.c_void_p, outfun: blast_out, outhow: ctypes.c_void_p, left: ctypes.POINTER(ctypes.c_uint32) = None, optionalInputArrayAndOutputLeftInfoPtr: ctypes.POINTER(ctypes.POINTER(ctypes.c_byte)) = None) -> ctypes.c_int:
+	"""A thin wrapper that forwards all its arguments to `lib.blast` and returns its result code.
+	If there is any unused input, *left is set to the number of bytes that were read and *in points to them.  Otherwise *left is set to zero and *in is set to NULL.  If left or in are NULL, then they are not set.
+	The annotations of this function are not just hints: `_initLibrary` reads them to set `argtypes` and `restype` of the native function, so they must stay valid ctypes types in the order of the native parameters."""
+
+	return lib.blast(infun, inhow, outfun, outhow, left, optionalInputArrayAndOutputLeftInfoPtr)
+
+
+def _initLibrary():
+	"""Loads the `libblast` shared library, sets the prototype of its `blast` function and returns the loaded library object."""
+	# `lib` inside this function is a local variable; the module-level `lib` is assigned from the returned value
+	# NOTE(review): every non-Windows platform gets `libblast.so`; macOS shared libraries usually have the `.dylib` extension - confirm
+	if platform.system() == "Windows":
+		lib = ctypes.CDLL("libblast.dll")
+	else:
+		lib = ctypes.CDLL("libblast.so")
+
+	# the prototype is taken from the annotations of the python-level `blast` wrapper, in the order of its parameters
+	lib.blast.argtypes = [blast.__annotations__[argName] for argName in blast.__code__.co_varnames[: blast.__code__.co_argcount]]
+	lib.blast.restype = blast.__annotations__["return"]
+	return lib
+
+
+# the library is loaded at import time, so importing this module fails if `libblast` cannot be loaded
+lib = _initLibrary()
+
+
+def outputCallback(outputStream, how, buf, l):
+	"""Output callback for `blast`: copies the first `l` bytes of `buf` into `outputStream`. `outputStream` is meant to be bound with `partial`, `how` is not used.
+	Returns `True` (non-zero) if the stream reported a count of written bytes other than `l`, `False` otherwise."""
+	return outputStream.write(bytes(buf[:l])) != l
+
+
+def inputCallbackStream(inputStream, hold, holdPtr, how, buf):
+	"""Input callback for `blast`: reads the next chunk of `inputStream` into the `hold` buffer, stores `holdPtr` (the pointer to that buffer) into `buf[0]` and returns the count of bytes read.
+	`inputStream`, `hold` and `holdPtr` are meant to be bound with `partial`, `how` is not used."""
+	# NOTE(review): `readinto` of a non-blocking stream may return `None` - presumably only blocking streams are expected here
+	countRead = inputStream.readinto(hold)
+	buf[0] = holdPtr
+	return countRead
+
+
+def _decompressStream(inputStream, outputStream, chunkSize: int = 16384) -> (int, int):
+	"""Runs `blast`, feeding it from `inputStream` through a buffer of `chunkSize` bytes and writing the output into `outputStream`.
+	Returns a tuple `(errorCode, left)`. `left` is a `ctypes.c_uint32` object passed to the library by reference, callers read its `.value`."""
+	# the chunk buffer that `inputStream.readinto` fills on every call of the input callback
+	hold = (ctypes.c_byte * chunkSize).from_buffer(bytearray(chunkSize))
+	# the same buffer as a pointer to `c_ubyte`, the type the input callback has to store into `buf[0]`
+	holdPtr = ctypes.cast(ctypes.pointer(hold), ctypes.POINTER(ctypes.c_ubyte))
+
+	left = ctypes.c_uint32(0)
+
+	# both `how` pointers are `None`: the state is bound to the callbacks with `partial` instead
+	return (
+		blast(
+			blast_in(partial(inputCallbackStream, inputStream, hold, holdPtr)), None,
+			blast_out(partial(outputCallback, outputStream)), None,
+			ctypes.byref(left), None
+		),
+		left
+	)
+
+
+def inputCallbackBytes(inputBytesPtr, l, how, buf):
+	"""Input callback for `blast` for the data that is already in memory: stores `inputBytesPtr` into `buf[0]` and returns `l`, so every call reports the whole buffer.
+	`inputBytesPtr` and `l` are meant to be bound with `partial`, `how` is not used."""
+	buf[0] = inputBytesPtr
+	return l
+
+
+def _decompressBytes(inputBytes: ByteString, outputStream) -> (int, int):
+	"""Runs `blast` on the whole `inputBytes` at once and writes the output into `outputStream`.
+	Returns a tuple `(errorCode, left)`. `left` is a `ctypes.c_uint32` object passed to the library by reference, callers read its `.value`."""
+	if isinstance(inputBytes, bytes):
+		# `from_buffer` needs a writable buffer, so immutable `bytes` are copied into a new ctypes buffer
+		inputBytesC = ctypes.create_string_buffer(inputBytes)
+	else:
+		# other inputs share their memory with the ctypes array, without copying
+		inputBytesC = (ctypes.c_byte * len(inputBytes)).from_buffer(inputBytes)
+	inputBytesPtr = ctypes.cast(ctypes.pointer(inputBytesC), ctypes.POINTER(ctypes.c_ubyte))
+	left = ctypes.c_uint32(0)
+
+	# both `how` pointers are `None`: the state is bound to the callbacks with `partial` instead
+	return (
+		blast(
+			blast_in(partial(inputCallbackBytes, inputBytesPtr, len(inputBytes))), None,
+			blast_out(partial(outputCallback, outputStream)), None,
+			ctypes.byref(left), None
+		),
+		left
+	)
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/tests/testDataset b/tests/testDataset
diff --git a/tests/tests.py b/tests/tests.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+https://github.com/fileTestSuite/fileTestSuite.py`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+*.imploded filter=lfs diff=lfs merge=lfs -text`
	`2`	`+*.decomp filter=lfs diff=lfs merge=lfs -text`