From 2347f3d6121ecd499b4aca2fde734553ba2f445c Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Sun, 23 Oct 2016 15:32:48 +0200 Subject: [PATCH 01/19] feat(enc): test with unicode tmpdir Borrowed from https://github.com/gitpython-developers/GitPython/pull/543 --- .travis.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.travis.yml b/.travis.yml index 464fafb..58aa658 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,8 +8,12 @@ python: - 3.3 - 3.4 - 3.5 +env: + # test with paths containing the unicode characters + - TMPDIR="/tmp/καλημέρα" install: - pip install coveralls + - ln -s /tmp "$TMPDIR" script: - ulimit -n 48 - ulimit -n From 883217bedd27dc5502a9760b8d734cbe11d39904 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Mon, 24 Oct 2016 04:34:09 +0200 Subject: [PATCH 02/19] fix(leaks): attempt to plug the leaks & filter dead regions + Possibly fixes #31 by no longer decrementing on destruction, relying only on `with...` resources. + feat(mmap): utility to check if regions have been closed (PY3-only). 
--- smmap/mman.py | 5 +---- smmap/util.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/smmap/mman.py b/smmap/mman.py index 9df69ed..1c9603d 100644 --- a/smmap/mman.py +++ b/smmap/mman.py @@ -43,9 +43,6 @@ def __init__(self, manager=None, regions=None): self._ofs = 0 self._size = 0 - def __del__(self): - self._destroy() - def __enter__(self): return self @@ -401,7 +398,7 @@ def make_cursor(self, path_or_fd): **Note:** Using file descriptors directly is faster once new windows are mapped as it prevents the file to be opened again just for the purpose of mapping it.""" regions = self._fdict.get(path_or_fd) - if regions is None: + if not regions or not regions.scream_if_closed(): regions = self.MapRegionListCls(path_or_fd) self._fdict[path_or_fd] = regions # END obtain region for path diff --git a/smmap/util.py b/smmap/util.py index 02df41a..9abf1f5 100644 --- a/smmap/util.py +++ b/smmap/util.py @@ -23,7 +23,7 @@ # Python 3 has no `buffer`; only `memoryview` def buffer(obj, offset, size): # Actually, for gitpython this is fastest ... . - return memoryview(obj)[offset:offset+size] + return memoryview(obj)[offset:offset + size] # doing it directly is much faster ! # return obj[offset:offset + size] @@ -206,7 +206,7 @@ def client_count(self): """:return: number of clients currently using this region""" return self._uc - def increment_client_count(self, ofs = 1): + def increment_client_count(self, ofs=1): """Adjust the usage count by the given positive or negative offset. If usage count equals 0, we will auto-release our resources :return: True if we released resources, False otherwise. 
In the latter case, we can still be used""" @@ -273,4 +273,10 @@ def file_size(self): # END update file size return self._file_size + def scream_if_closed(self): + for r in self: + if getattr(r, '_mf.closed', None): # > `closed` attribute PY3.2+ + raise Exception('Found closed region: %s' % r._mf) + return self + #} END utility classes From 43c5f233477579eed0b9bd5eb8f0049388e1db82 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Mon, 24 Oct 2016 15:00:01 +0200 Subject: [PATCH 03/19] chore(ver): bump 2.0.1-->2.1.0.dev0 --- smmap/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smmap/__init__.py b/smmap/__init__.py index 9cfd0a1..7335971 100644 --- a/smmap/__init__.py +++ b/smmap/__init__.py @@ -3,7 +3,7 @@ __author__ = "Sebastian Thiel" __contact__ = "byronimo@gmail.com" __homepage__ = "https://github.com/Byron/smmap" -version_info = (2, 0, 1) +version_info = (2, 1, 0, 'dev0') __version__ = '.'.join(str(i) for i in version_info) # make everything available in root package for convenience From f10196f7974cc00e0e2f3e6b52262d66f7b99da1 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Mon, 24 Oct 2016 17:49:23 +0200 Subject: [PATCH 04/19] fix(regs): fix/rename scream_if_closed()-->collect_closed_regions() --- smmap/mman.py | 4 +++- smmap/util.py | 9 ++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/smmap/mman.py b/smmap/mman.py index 1c9603d..a0764ad 100644 --- a/smmap/mman.py +++ b/smmap/mman.py @@ -398,7 +398,9 @@ def make_cursor(self, path_or_fd): **Note:** Using file descriptors directly is faster once new windows are mapped as it prevents the file to be opened again just for the purpose of mapping it.""" regions = self._fdict.get(path_or_fd) - if not regions or not regions.scream_if_closed(): + if regions: + assert not regions.collect_closed_regions(), regions.collect_closed_regions() + else: regions = self.MapRegionListCls(path_or_fd) self._fdict[path_or_fd] = regions # END obtain region for 
path diff --git a/smmap/util.py b/smmap/util.py index 9abf1f5..ac3d9f5 100644 --- a/smmap/util.py +++ b/smmap/util.py @@ -273,10 +273,9 @@ def file_size(self): # END update file size return self._file_size - def scream_if_closed(self): - for r in self: - if getattr(r, '_mf.closed', None): # > `closed` attribute PY3.2+ - raise Exception('Found closed region: %s' % r._mf) - return self + def collect_closed_regions(self): + """a PY3+ utility for assertions""" + # The `closed` attribute is PY3.2+ + return [region for region in self if getattr(region._mf, 'closed', None)] #} END utility classes From 88e276952639711f1bca7774cd04d96a5564aae3 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Mon, 24 Oct 2016 18:21:37 +0200 Subject: [PATCH 05/19] chore(ver): bump 2.0.1-->2.1.0.dev1 --- smmap/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smmap/__init__.py b/smmap/__init__.py index 7335971..2148b50 100644 --- a/smmap/__init__.py +++ b/smmap/__init__.py @@ -3,7 +3,7 @@ __author__ = "Sebastian Thiel" __contact__ = "byronimo@gmail.com" __homepage__ = "https://github.com/Byron/smmap" -version_info = (2, 1, 0, 'dev0') +version_info = (2, 1, 0, 'dev1') __version__ = '.'.join(str(i) for i in version_info) # make everything available in root package for convenience From 133dd1c9eb26c41683b15ea3459e8b7de5603358 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Mon, 24 Oct 2016 23:42:02 +0200 Subject: [PATCH 06/19] style(listuple): pep8, literals for empty lists/tuples --- smmap/__init__.py | 4 ++-- smmap/buf.py | 5 ++--- smmap/mman.py | 2 +- smmap/test/test_buf.py | 12 +++++++----- smmap/util.py | 6 +++--- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/smmap/__init__.py b/smmap/__init__.py index 2148b50..065a82a 100644 --- a/smmap/__init__.py +++ b/smmap/__init__.py @@ -7,5 +7,5 @@ __version__ = '.'.join(str(i) for i in version_info) # make everything available in root package for convenience -from .mman import * 
-from .buf import * +from .mman import * # @IgnorePep8 +from .buf import * # @IgnorePep8 diff --git a/smmap/buf.py b/smmap/buf.py index 438292b..cf73970 100644 --- a/smmap/buf.py +++ b/smmap/buf.py @@ -3,12 +3,11 @@ __all__ = ["SlidingWindowMapBuffer"] -import sys try: bytes except NameError: - bytes = str + bytes = str # @ReservedAssignment class SlidingWindowMapBuffer(object): @@ -103,7 +102,7 @@ def __getslice__(self, i, j): # END while there are bytes to read return out else: - md = list() + md = [] while l: c.use_region(ofs, l) assert c.is_valid() diff --git a/smmap/mman.py b/smmap/mman.py index a0764ad..4c00515 100644 --- a/smmap/mman.py +++ b/smmap/mman.py @@ -483,7 +483,7 @@ class SlidingWindowMapManager(StaticWindowMapManager): a safe amount of memory already, which would possibly cause memory allocations to fail as our address space is full.""" - __slots__ = tuple() + __slots__ = () def __init__(self, window_size=-1, max_memory_size=0, max_open_handles=sys.maxsize): """Adjusts the default window size to -1""" diff --git a/smmap/test/test_buf.py b/smmap/test/test_buf.py index 3b6009e..a7bc1b3 100644 --- a/smmap/test/test_buf.py +++ b/smmap/test/test_buf.py @@ -29,10 +29,10 @@ def test_basics(self): # invalid paths fail upon construction c = man_optimal.make_cursor(fc.path) - self.assertRaises(ValueError, SlidingWindowMapBuffer, type(c)()) # invalid cursor - self.assertRaises(ValueError, SlidingWindowMapBuffer, c, fc.size) # offset too large + self.assertRaises(ValueError, SlidingWindowMapBuffer, type(c)()) # invalid cursor + self.assertRaises(ValueError, SlidingWindowMapBuffer, c, fc.size) # offset too large - buf = SlidingWindowMapBuffer() # can create uninitailized buffers + buf = SlidingWindowMapBuffer() # can create uninitailized buffers assert buf.cursor() is None # can call end access any time @@ -118,8 +118,10 @@ def test_basics(self): elapsed = max(time() - st, 0.001) # prevent zero division errors on windows mb = float(1000 * 1000) mode_str = 
(access_mode and "slice") or "single byte" - print("%s: Made %i random %s accesses to buffer created from %s reading a total of %f mb in %f s (%f mb/s)" - % (man_id, max_num_accesses, mode_str, type(item), num_bytes / mb, elapsed, (num_bytes / mb) / elapsed), + print("%s: Made %i random %s accesses to buffer created from %s " + "reading a total of %f mb in %f s (%f mb/s)" + % (man_id, max_num_accesses, mode_str, type(item), + num_bytes / mb, elapsed, (num_bytes / mb) / elapsed), file=sys.stderr) # END handle access mode del buf diff --git a/smmap/util.py b/smmap/util.py index ac3d9f5..ab1f8a3 100644 --- a/smmap/util.py +++ b/smmap/util.py @@ -18,7 +18,7 @@ try: # Python 2 - buffer = buffer + buffer = buffer # @UndefinedVariable except NameError: # Python 3 has no `buffer`; only `memoryview` def buffer(obj, offset, size): @@ -32,7 +32,7 @@ def string_types(): if sys.version_info[0] >= 3: return str else: - return basestring + return basestring # @UndefinedVariable def align_to_mmap(num, round_up): @@ -111,9 +111,9 @@ class MapRegion(object): **Note:** deallocates used region automatically on destruction""" __slots__ = [ '_b', # beginning of mapping + '_size', # cached size of our memory map '_mf', # mapped memory chunk (as returned by mmap) '_uc', # total amount of usages - '_size', # cached size of our memory map '__weakref__' ] _need_compat_layer = sys.version_info[:2] < (2, 6) From 7c1eac7944c680ca191fc8e54d92e301ae4b79a9 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Mon, 24 Oct 2016 23:42:39 +0200 Subject: [PATCH 07/19] refact(region): rename offset `_b --> _ofs` --- smmap/mman.py | 14 +++++++------- smmap/util.py | 24 ++++++++++++------------ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/smmap/mman.py b/smmap/mman.py index 4c00515..41a1cbc 100644 --- a/smmap/mman.py +++ b/smmap/mman.py @@ -133,7 +133,7 @@ def use_region(self, offset=0, size=0, flags=0): self._region.increment_client_count() # END need region handling - self._ofs 
= offset - self._region._b + self._ofs = offset - self._region._ofs self._size = min(size, self._region.ofs_end() - offset) return self @@ -179,12 +179,12 @@ def ofs_begin(self): """:return: offset to the first byte pointed to by our cursor **Note:** only if is_valid() is True""" - return self._region._b + self._ofs + return self._region._ofs + self._ofs def ofs_end(self): """:return: offset to one past the last available byte""" # unroll method calls for performance ! - return self._region._b + self._ofs + self._size + return self._region._ofs + self._ofs + self._size def size(self): """:return: amount of bytes we point to""" @@ -201,7 +201,7 @@ def includes_ofs(self, ofs): **Note:** cursor must be valid for this to work""" # unroll methods - return (self._region._b + self._ofs) <= ofs < (self._region._b + self._ofs + self._size) + return (self._region._ofs + self._ofs) <= ofs < (self._region._ofs + self._ofs + self._size) def file_size(self): """:return: size of the underlying file""" @@ -497,7 +497,7 @@ def _obtain_region(self, a, offset, size, flags, is_recursive): hi = len(a) while lo < hi: mid = (lo + hi) // 2 - ofs = a[mid]._b + ofs = a[mid]._ofs if ofs <= offset: if a[mid].includes_ofs(offset): r = a[mid] @@ -526,14 +526,14 @@ def _obtain_region(self, a, offset, size, flags, is_recursive): insert_pos = 0 len_regions = len(a) if len_regions == 1: - if a[0]._b <= offset: + if a[0]._ofs <= offset: insert_pos = 1 # END maintain sort else: # find insert position insert_pos = len_regions for i, region in enumerate(a): - if region._b > offset: + if region._ofs > offset: insert_pos = i break # END if insert position is correct diff --git a/smmap/util.py b/smmap/util.py index ab1f8a3..409e56a 100644 --- a/smmap/util.py +++ b/smmap/util.py @@ -63,7 +63,7 @@ class MapWindow(object): """Utility type which is used to snap windows towards each other, and to adjust their size""" __slots__ = ( 'ofs', # offset into the file in bytes - 'size' # size of the window in bytes + 
'size' # size of the window in bytes ) def __init__(self, offset, size): @@ -76,7 +76,7 @@ def __repr__(self): @classmethod def from_region(cls, region): """:return: new window from a region""" - return cls(region._b, region.size()) + return cls(region._ofs, region.size()) def ofs_end(self): return self.ofs + self.size @@ -110,10 +110,10 @@ class MapRegion(object): **Note:** deallocates used region automatically on destruction""" __slots__ = [ - '_b', # beginning of mapping + '_ofs', # beginning of mapping '_size', # cached size of our memory map - '_mf', # mapped memory chunk (as returned by mmap) - '_uc', # total amount of usages + '_mf', # mapped memory chunk (as returned by mmap) + '_uc', # total amount of usages '__weakref__' ] _need_compat_layer = sys.version_info[:2] < (2, 6) @@ -133,7 +133,7 @@ def __init__(self, path_or_fd, ofs, size, flags=0): allocated the the size automatically adjusted :param flags: additional flags to be given when opening the file. :raise Exception: if no memory can be allocated""" - self._b = ofs + self._ofs = ofs self._size = 0 self._uc = 0 @@ -174,7 +174,7 @@ def __init__(self, path_or_fd, ofs, size, flags=0): self.increment_client_count() def __repr__(self): - return "MapRegion<%i, %i>" % (self._b, self.size()) + return "MapRegion<%i, %i>" % (self._ofs, self.size()) #{ Interface @@ -188,7 +188,7 @@ def map(self): def ofs_begin(self): """:return: absolute byte offset to the first byte of the mapping""" - return self._b + return self._ofs def size(self): """:return: total size of the mapped region in bytes""" @@ -196,11 +196,11 @@ def size(self): def ofs_end(self): """:return: Absolute offset to one byte beyond the mapping into the file""" - return self._b + self._size + return self._ofs + self._size def includes_ofs(self, ofs): """:return: True if the given offset can be read in our mapped region""" - return self._b <= ofs < self._b + self._size + return self._ofs <= ofs < self._ofs + self._size def client_count(self): """:return: 
number of clients currently using this region""" @@ -227,7 +227,7 @@ def release(self): # re-define all methods which need offset adjustments in compatibility mode if _need_compat_layer: def size(self): - return self._size - self._b + return self._size - self._ofs def ofs_end(self): # always the size - we are as large as it gets @@ -237,7 +237,7 @@ def buffer(self): return self._mfb def includes_ofs(self, ofs): - return self._b <= ofs < self._size + return self._ofs <= ofs < self._size # END handle compat layer #} END interface From bba086aa50c8d57ba84973482d23897e876fe121 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Tue, 25 Oct 2016 11:43:18 +0200 Subject: [PATCH 08/19] refact(minor): use region.priv-func, close fd on same condition --- smmap/mman.py | 6 ++++-- smmap/util.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/smmap/mman.py b/smmap/mman.py index 41a1cbc..1146bf6 100644 --- a/smmap/mman.py +++ b/smmap/mman.py @@ -319,9 +319,11 @@ def _collect_lru_region(self, size): lru_list = None for regions in self._fdict.values(): for region in regions: - # check client count - if it's 1, it's just us + ## Check client count - if it's 1, it's just us. 
+ # if (region.client_count() == 1 and - (lru_region is None or region._uc < lru_region._uc)): + (lru_region is None or + region.client_count() < lru_region.client_count())): lru_region = region lru_list = regions # END update lru_region diff --git a/smmap/util.py b/smmap/util.py index 409e56a..472633e 100644 --- a/smmap/util.py +++ b/smmap/util.py @@ -166,7 +166,7 @@ def __init__(self, path_or_fd, ofs, size, flags=0): self._mfb = buffer(self._mf, ofs, self._size) # END handle buffer wrapping finally: - if isinstance(path_or_fd, string_types()): + if not isinstance(path_or_fd, int): os.close(fd) # END only close it if we opened it # END close file handle From a2bc2d2e983e34b576f87127a531fdf978ecc322 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Tue, 25 Oct 2016 18:39:33 +0200 Subject: [PATCH 09/19] feat(mman): BREAKING API `mman` as context-manager to release regions + Add PY3 compat utilities + doc(changes, tutorial): update on mman usage --- doc/source/changes.rst | 36 ++-- doc/source/tutorial.rst | 148 ++++++++------- smmap/mman.py | 84 +++++++-- smmap/test/test_buf.py | 190 +++++++++---------- smmap/test/test_mman.py | 355 ++++++++++++++++++------------------ smmap/test/test_tutorial.py | 115 ++++++------ smmap/util.py | 5 +- 7 files changed, 519 insertions(+), 414 deletions(-) diff --git a/doc/source/changes.rst b/doc/source/changes.rst index f99e85f..3600adc 100644 --- a/doc/source/changes.rst +++ b/doc/source/changes.rst @@ -2,34 +2,46 @@ Changelog ######### -********** +2.1.0 +====== + +* **BREAKING API:** retrofit ``smmap.mman`` as context-manager, + to release memory-mapped regions held. + + The *mmap-manager(s)* are re-entrant, but not thread-safe **context-manager(s)**, + to be used within a ``with ...:`` block, ensuring any left-over cursors are cleaned up. + If not entered, :meth:`StaticWindowMapManager.make_cursor()` and/or + :meth:`WindowCursor.use_region()` will scream. + + Get them from ``smmap.managed_mmaps()``. 
+ v0.9.0 -********** +======== - Fixed issue with resources never being freed as mmaps were never closed. - Client counting is now done manually, instead of relying on pyton's reference count -********** + v0.8.5 -********** +======== - Fixed Python 3.0-3.3 regression, which also causes smmap to become about 3 times slower depending on the code path. It's related to this bug (http://bugs.python.org/issue15958), which was fixed in python 3.4 -********** + v0.8.4 -********** +======== - Fixed Python 3 performance regression -********** + v0.8.3 -********** +======== - Cleaned up code and assured it works sufficiently well with python 3 -********** + v0.8.1 -********** +======== - A single bugfix -********** + v0.8.0 -********** +======== - Initial Release diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index 917b245..710f9d7 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -5,91 +5,111 @@ Usage Guide ########### This text briefly introduces you to the basic design decisions and accompanying classes. -****** Design -****** -Per application, there is *MemoryManager* which is held as static instance and used throughout the application. It can be configured to keep your resources within certain limits. +====== +Per application, there must be a *MemoryManager* to be used throughout the application. +It can be configured to keep your resources within certain limits. -To access mapped regions, you require a cursor. Cursors point to exactly one file and serve as handles into it. As long as it exists, the respective memory region will remain available. +To access mapped regions, you require a cursor. Cursors point to exactly one file and serve as handles into it. +As long as it exists, the respective memory region will remain available. + +For convenience, a buffer implementation is provided which handles cursors and resource allocation +behind its simple buffer like interface. 
-For convenience, a buffer implementation is provided which handles cursors and resource allocation behind its simple buffer like interface. -*************** Memory Managers -*************** -There are two types of memory managers, one uses *static* windows, the other one uses *sliding* windows. A window is a region of a file mapped into memory. Although the names might be somewhat misleading as technically windows are always static, the *sliding* version will allocate relatively small windows whereas the *static* version will always map the whole file. +================ +There are two types of memory managers, one uses *static* windows, the other one uses *sliding* windows. +A window is a region of a file mapped into memory. Although the names might be somewhat misleading, +as technically windows are always static, the *sliding* version will allocate relatively small windows +whereas the *static* version will always map the whole file. + +The *static* memory-manager does nothing more than keeping a client count on the respective memory maps +which always map the whole file, which allows to make some assumptions that can lead to simplified +data access and increased performance, but reduces the compatibility to 32 bit systems or giant files. + +The *sliding* memory-manager therefore should be the default manager when preparing an application +for handling huge amounts of data on 32 bit and 64 bit platforms -The *static* manager does nothing more than keeping a client count on the respective memory maps which always map the whole file, which allows to make some assumptions that can lead to simplified data access and increased performance, but reduces the compatibility to 32 bit systems or giant files. +.. Note:: + The *mmap-manager(s)* are re-entrant, but not thread-safe **context-manager(s)**, + to be used within a ``with ...:`` block, ensuring any left-overs cursors are cleaned up. 
+ If not entered, :meth:`StaticWindowMapManager.make_cursor()` and/or + :meth:`WindowCursor.use_region()` will scream. -The *sliding* memory manager therefore should be the default manager when preparing an application for handling huge amounts of data on 32 bit and 64 bit platforms:: + +Use :func:`smmap.managed_mmaps()` to take care of all this:: import smmap # This instance should be globally available in your application # It is configured to be well suitable for 32-bit or 64 bit applications. - mman = smmap.SlidingWindowMapManager() + with smmap.managed_mmaps() as mman: - # the manager provides much useful information about its current state - # like the amount of open file handles or the amount of mapped memory - mman.num_file_handles() - mman.mapped_memory_size() - # and many more ... + # the manager provides much useful information about its current state + # like the amount of open file handles or the amount of mapped memory + mman.num_file_handles() + mman.mapped_memory_size() + # and many more ... Cursors -******* +======== *Cursors* are handles that point onto a window, i.e. a region of a file mapped into memory. From them you may obtain a buffer through which the data of that window can actually be accessed:: import smmap.test.lib - fc = smmap.test.lib.FileCreator(1024*1024*8, "test_file") - - # obtain a cursor to access some file. - c = mman.make_cursor(fc.path) - - # the cursor is now associated with the file, but not yet usable - assert c.is_associated() - assert not c.is_valid() - - # before you can use the cursor, you have to specify a window you want to - # access. The following just says you want as much data as possible starting - # from offset 0. 
- # To be sure your region could be mapped, query for validity - assert c.use_region().is_valid() # use_region returns self - - # once a region was mapped, you must query its dimension regularly - # to assure you don't try to access its buffer out of its bounds - assert c.size() - c.buffer()[0] # first byte - c.buffer()[1:10] # first 9 bytes - c.buffer()[c.size()-1] # last byte - - # its recommended not to create big slices when feeding the buffer - # into consumers (e.g. struct or zlib). - # Instead, either give the buffer directly, or use pythons buffer command. - buffer(c.buffer(), 1, 9) # first 9 bytes without copying them - - # you can query absolute offsets, and check whether an offset is included - # in the cursor's data. - assert c.ofs_begin() < c.ofs_end() - assert c.includes_ofs(100) - - # If you are over out of bounds with one of your region requests, the - # cursor will be come invalid. It cannot be used in that state - assert not c.use_region(fc.size, 100).is_valid() - # map as much as possible after skipping the first 100 bytes - assert c.use_region(100).is_valid() - - # You can explicitly free cursor resources by unusing the cursor's region - c.unuse_region() - assert not c.is_valid() + + with smmap.managed_mmaps() as mman: + fc = smmap.test.lib.FileCreator(1024*1024*8, "test_file") + + # obtain a cursor to access some file. + c = mman.make_cursor(fc.path) + + # the cursor is now associated with the file, but not yet usable + assert c.is_associated() + assert not c.is_valid() + + # before you can use the cursor, you have to specify a window you want to + # access. The following just says you want as much data as possible starting + # from offset 0. 
+ # To be sure your region could be mapped, query for validity + assert c.use_region().is_valid() # use_region returns self + + # once a region was mapped, you must query its dimension regularly + # to assure you don't try to access its buffer out of its bounds + assert c.size() + c.buffer()[0] # first byte + c.buffer()[1:10] # first 9 bytes + c.buffer()[c.size()-1] # last byte + + # its recommended not to create big slices when feeding the buffer + # into consumers (e.g. struct or zlib). + # Instead, either give the buffer directly, or use pythons buffer command. + buffer(c.buffer(), 1, 9) # first 9 bytes without copying them + + # you can query absolute offsets, and check whether an offset is included + # in the cursor's data. + assert c.ofs_begin() < c.ofs_end() + assert c.includes_ofs(100) + + # If you are over out of bounds with one of your region requests, the + # cursor will be come invalid. It cannot be used in that state + assert not c.use_region(fc.size, 100).is_valid() + # map as much as possible after skipping the first 100 bytes + assert c.use_region(100).is_valid() + + # You can explicitly free cursor resources by unusing the cursor's region + c.unuse_region() + assert not c.is_valid() Now you would have to write your algorithms around this interface to properly slide through huge amounts of data. Alternatively you can use a convenience interface. -******* + +======== Buffers -******* +======== To make first use easier, at the expense of performance, there is a Buffer implementation which uses a cursor underneath. 
With it, you can access all data in a possibly huge file without having to take care of setting the cursor to different regions yourself:: @@ -112,7 +132,9 @@ With it, you can access all data in a possibly huge file without having to take # it will stop using resources automatically once it goes out of scope -Disadvantages -************* -Buffers cannot be used in place of strings or maps, hence you have to slice them to have valid input for the sorts of struct and zlib. A slice means a lot of data handling overhead which makes buffers slower compared to using cursors directly. + Disadvantages + -------------- + Buffers cannot be used in place of strings or maps, hence you have to slice them to have valid + input for the sorts of struct and zlib. + A slice means a lot of data handling overhead which makes buffers slower compared to using cursors directly. diff --git a/smmap/mman.py b/smmap/mman.py index 1146bf6..1e63667 100644 --- a/smmap/mman.py +++ b/smmap/mman.py @@ -1,5 +1,10 @@ """Module containing a memory memory manager which provides a sliding window on a number of memory mapped files""" +from functools import reduce +import logging +import sys + from .util import ( + PY3, MapWindow, MapRegion, MapRegionList, @@ -8,15 +13,29 @@ buffer, ) -import sys -from functools import reduce -__all__ = ["StaticWindowMapManager", "SlidingWindowMapManager", "WindowCursor"] +__all__ = ['managed_mmaps', "StaticWindowMapManager", "SlidingWindowMapManager", "WindowCursor"] #{ Utilities - +log = logging.getLogger(__name__) #}END utilities +def managed_mmaps(): + """Makes a memory-map context-manager instance for the correct python-version. + + :return: either :class:`SlidingWindowMapManager` or :class:`StaticWindowMapManager` (if PY2) + + If you want to change the default parameters of these classes, use them directly. + + .. Tip:: + Use it in a ``with ...:`` block, to free cached (and unused) resources. 
+ + """ + mman = SlidingWindowMapManager if PY3 else StaticWindowMapManager + + return mman() + + class WindowCursor(object): """ @@ -25,9 +44,15 @@ class WindowCursor(object): Cursors should not be created manually, but are instead returned by the SlidingWindowMapManager - **Note:**: The current implementation is suited for static and sliding window managers, but it also means - that it must be suited for the somewhat quite different sliding manager. It could be improved, but - I see no real need to do so.""" + .. Tip:: + This is a re-entrant, but not thread-safe context-manager, to be used within a ``with ...:`` block, + to ensure any left-overs cursors are cleaned up. If not entered, :meth:`use_region()`` + will scream. + + .. Note:: + The current implementation is suited for static and sliding window managers, + but it also means that it must be suited for the somewhat quite different sliding manager. + It could be improved, but I see no real need to do so.""" __slots__ = ( '_manager', # the manger keeping all file regions '_rlist', # a regions list with regions for our file @@ -110,6 +135,10 @@ def use_region(self, offset=0, size=0, flags=0): **Note:**: The size actually mapped may be smaller than the given size. If that is the case, either the file has reached its end, or the map was created between two existing regions""" + if self._manager._entered <= 0: + raise ValueError('Context-manager %s not entered for %s!' % + (self._manager, self)) + need_region = True man = self._manager fsize = self._rlist.file_size() @@ -243,15 +272,23 @@ class StaticWindowMapManager(object): These clients would have to use a SlidingWindowMapBuffer to hide this fact. This type will always use a maximum window size, and optimize certain methods to - accommodate this fact""" + accommodate this fact + + .. Tip:: + The *memory-managers* are re-entrant, but not thread-safe context-manager(s), + to be used within a ``with ...:`` block, ensuring any left-overs cursors are cleaned up. 
+ If not entered, :meth:`make_cursor()` and/or :meth:`WindowCursor.use_region()` will scream. + + """ __slots__ = [ - '_fdict', # mapping of path -> StorageHelper (of some kind - '_window_size', # maximum size of a window - '_max_memory_size', # maximum amount of memory we may allocate - '_max_handle_count', # maximum amount of handles to keep open - '_memory_size', # currently allocated memory size + '_fdict', # mapping of path -> StorageHelper (of some kind + '_window_size', # maximum size of a window + '_max_memory_size', # maximum amount of memory we may allocate + '_max_handle_count', # maximum amount of handles to keep open + '_memory_size', # currently allocated memory size '_handle_count', # amount of currently allocated file handles + '_entered', # updated on enter/exit, when 0, `close()` ] #{ Configuration @@ -280,6 +317,7 @@ def __init__(self, window_size=0, max_memory_size=0, max_open_handles=sys.maxsiz self._max_handle_count = max_open_handles self._memory_size = 0 self._handle_count = 0 + self._entered = 0 if window_size < 0: coeff = 64 @@ -297,6 +335,23 @@ def __init__(self, window_size=0, max_memory_size=0, max_open_handles=sys.maxsiz self._max_memory_size = coeff * self._MB_in_bytes # END handle max memory size + def __enter__(self): + assert self._entered >= 0, self._entered + self._entered += 1 + + return self + + def __exit__(self, exc_type, exc_value, traceback): + assert self._entered > 0, self._entered + self._entered -= 1 + if self._entered == 0: + leaft_overs = self.collect() + if leaft_overs: + log.warning("Cleaned up %s left-over mmap-regions.") + + def close(self): + self.collect() + #{ Internal Methods def _collect_lru_region(self, size): @@ -399,6 +454,9 @@ def make_cursor(self, path_or_fd): **Note:** Using file descriptors directly is faster once new windows are mapped as it prevents the file to be opened again just for the purpose of mapping it.""" + if self._entered <= 0: + raise ValueError('Context-manager %s not entered!' 
% self) + regions = self._fdict.get(path_or_fd) if regions: assert not regions.collect_closed_regions(), regions.collect_closed_regions() diff --git a/smmap/test/test_buf.py b/smmap/test/test_buf.py index a7bc1b3..a2c2e7d 100644 --- a/smmap/test/test_buf.py +++ b/smmap/test/test_buf.py @@ -25,106 +25,112 @@ class TestBuf(TestBase): def test_basics(self): + # invalid paths fail upon construction + with FileCreator(self.k_window_test_size, "buffer_test") as fc: + with man_optimal: + c = man_optimal.make_cursor(fc.path) + self.assertRaises(ValueError, SlidingWindowMapBuffer, type(c)()) # invalid cursor + self.assertRaises(ValueError, SlidingWindowMapBuffer, c, fc.size) # offset too large + + buf = SlidingWindowMapBuffer() # can create uninitailized buffers + assert buf.cursor() is None + + # can call end access any time + buf.end_access() + buf.end_access() + assert len(buf) == 0 + + # begin access can revive it, if the offset is suitable + offset = 100 + assert buf.begin_access(c, fc.size) == False + assert buf.begin_access(c, offset) == True + assert len(buf) == fc.size - offset + assert buf.cursor().is_valid() + + # empty begin access keeps it valid on the same path, but alters the offset + assert buf.begin_access() == True + assert len(buf) == fc.size + assert buf.cursor().is_valid() + + # simple access + with open(fc.path, 'rb') as fp: + data = fp.read() + assert data[offset] == buf[0] + assert data[offset:offset * 2] == buf[0:offset] + + # negative indices, partial slices + assert buf[-1] == buf[len(buf) - 1] + assert buf[-10:] == buf[len(buf) - 10:len(buf)] + + # end access makes its cursor invalid + buf.end_access() + assert not buf.cursor().is_valid() + assert buf.cursor().is_associated() # but it remains associated + + # an empty begin access fixes it up again + assert buf.begin_access() == True and buf.cursor().is_valid() + del(buf) # ends access automatically + del(c) + + assert man_optimal.num_file_handles() == 1 + + def test_performance(self): + # 
PERFORMANCE + # blast away with random access and a full mapping - we don't want to + # exaggerate the manager's overhead, but measure the buffer overhead + # We do it once with an optimal setting, and with a worse manager which + # will produce small mappings only ! with FileCreator(self.k_window_test_size, "buffer_test") as fc: - - # invalid paths fail upon construction - c = man_optimal.make_cursor(fc.path) - self.assertRaises(ValueError, SlidingWindowMapBuffer, type(c)()) # invalid cursor - self.assertRaises(ValueError, SlidingWindowMapBuffer, c, fc.size) # offset too large - - buf = SlidingWindowMapBuffer() # can create uninitailized buffers - assert buf.cursor() is None - - # can call end access any time - buf.end_access() - buf.end_access() - assert len(buf) == 0 - - # begin access can revive it, if the offset is suitable - offset = 100 - assert buf.begin_access(c, fc.size) == False - assert buf.begin_access(c, offset) == True - assert len(buf) == fc.size - offset - assert buf.cursor().is_valid() - - # empty begin access keeps it valid on the same path, but alters the offset - assert buf.begin_access() == True - assert len(buf) == fc.size - assert buf.cursor().is_valid() - - # simple access with open(fc.path, 'rb') as fp: data = fp.read() - assert data[offset] == buf[0] - assert data[offset:offset * 2] == buf[0:offset] - - # negative indices, partial slices - assert buf[-1] == buf[len(buf) - 1] - assert buf[-10:] == buf[len(buf) - 10:len(buf)] - - # end access makes its cursor invalid - buf.end_access() - assert not buf.cursor().is_valid() - assert buf.cursor().is_associated() # but it remains associated - - # an empty begin access fixes it up again - assert buf.begin_access() == True and buf.cursor().is_valid() - del(buf) # ends access automatically - del(c) - - assert man_optimal.num_file_handles() == 1 - - # PERFORMANCE - # blast away with random access and a full mapping - we don't want to - # exaggerate the manager's overhead, but measure the buffer 
overhead - # We do it once with an optimal setting, and with a worse manager which - # will produce small mappings only ! + max_num_accesses = 100 fd = os.open(fc.path, os.O_RDONLY) for item in (fc.path, fd): for manager, man_id in ((man_optimal, 'optimal'), (man_worst_case, 'worst case'), (static_man, 'static optimal')): - buf = SlidingWindowMapBuffer(manager.make_cursor(item)) - assert manager.num_file_handles() == 1 - for access_mode in range(2): # single, multi - num_accesses_left = max_num_accesses - num_bytes = 0 - fsize = fc.size - - st = time() - buf.begin_access() - while num_accesses_left: - num_accesses_left -= 1 - if access_mode: # multi - ofs_start = randint(0, fsize) - ofs_end = randint(ofs_start, fsize) - d = buf[ofs_start:ofs_end] - assert len(d) == ofs_end - ofs_start - assert d == data[ofs_start:ofs_end] - num_bytes += len(d) - del d - else: - pos = randint(0, fsize) - assert buf[pos] == data[pos] - num_bytes += 1 - # END handle mode - # END handle num accesses - - buf.end_access() - assert manager.num_file_handles() - assert manager.collect() - assert manager.num_file_handles() == 0 - elapsed = max(time() - st, 0.001) # prevent zero division errors on windows - mb = float(1000 * 1000) - mode_str = (access_mode and "slice") or "single byte" - print("%s: Made %i random %s accesses to buffer created from %s " - "reading a total of %f mb in %f s (%f mb/s)" - % (man_id, max_num_accesses, mode_str, type(item), - num_bytes / mb, elapsed, (num_bytes / mb) / elapsed), - file=sys.stderr) - # END handle access mode - del buf - # END for each manager + with manager: + buf = SlidingWindowMapBuffer(manager.make_cursor(item)) + assert manager.num_file_handles() == 1 + for access_mode in range(2): # single, multi + num_accesses_left = max_num_accesses + num_bytes = 0 + fsize = fc.size + + st = time() + buf.begin_access() + while num_accesses_left: + num_accesses_left -= 1 + if access_mode: # multi + ofs_start = randint(0, fsize) + ofs_end = randint(ofs_start, 
fsize) + d = buf[ofs_start:ofs_end] + assert len(d) == ofs_end - ofs_start + assert d == data[ofs_start:ofs_end] + num_bytes += len(d) + del d + else: + pos = randint(0, fsize) + assert buf[pos] == data[pos] + num_bytes += 1 + # END handle mode + # END handle num accesses + + buf.end_access() + assert manager.num_file_handles() + assert manager.collect() + assert manager.num_file_handles() == 0 + elapsed = max(time() - st, 0.001) # prevent zero division errors on windows + mb = float(1000 * 1000) + mode_str = (access_mode and "slice") or "single byte" + print("%s: Made %i random %s accesses to buffer created from %s " + "reading a total of %f mb in %f s (%f mb/s)" + % (man_id, max_num_accesses, mode_str, type(item), + num_bytes / mb, elapsed, (num_bytes / mb) / elapsed), + file=sys.stderr) + # END handle access mode + del buf + # END for each manager # END for each input os.close(fd) diff --git a/smmap/test/test_mman.py b/smmap/test/test_mman.py index 96bc355..263d8fb 100644 --- a/smmap/test/test_mman.py +++ b/smmap/test/test_mman.py @@ -20,80 +20,81 @@ class TestMMan(TestBase): def test_cursor(self): with FileCreator(self.k_window_test_size, "cursor_test") as fc: - man = SlidingWindowMapManager() - ci = WindowCursor(man) # invalid cursor - assert not ci.is_valid() + with SlidingWindowMapManager() as man: + ci = WindowCursor(man) # invalid cursor + assert not ci.is_valid() + assert not ci.is_associated() + assert ci.size() == 0 # this is cached, so we can query it in invalid state + + cv = man.make_cursor(fc.path) + assert not cv.is_valid() # no region mapped yet + assert cv.is_associated() # but it know where to map it from + assert cv.file_size() == fc.size + assert cv.path() == fc.path + + # copy module + cio = copy(cv) + assert not cio.is_valid() and cio.is_associated() + + # assign method assert not ci.is_associated() - assert ci.size() == 0 # this is cached, so we can query it in invalid state + ci.assign(cv) + assert not ci.is_valid() and ci.is_associated() 
- cv = man.make_cursor(fc.path) - assert not cv.is_valid() # no region mapped yet - assert cv.is_associated() # but it know where to map it from - assert cv.file_size() == fc.size - assert cv.path() == fc.path + # unuse non-existing region is fine + cv.unuse_region() + cv.unuse_region() - # copy module - cio = copy(cv) - assert not cio.is_valid() and cio.is_associated() - - # assign method - assert not ci.is_associated() - ci.assign(cv) - assert not ci.is_valid() and ci.is_associated() - - # unuse non-existing region is fine - cv.unuse_region() - cv.unuse_region() - - # destruction is fine (even multiple times) - cv._destroy() - WindowCursor(man)._destroy() + # destruction is fine (even multiple times) + cv._destroy() + WindowCursor(man)._destroy() def test_memory_manager(self): slide_man = SlidingWindowMapManager() static_man = StaticWindowMapManager() for man in (static_man, slide_man): - assert man.num_file_handles() == 0 - assert man.num_open_files() == 0 - winsize_cmp_val = 0 - if isinstance(man, StaticWindowMapManager): - winsize_cmp_val = -1 - # END handle window size - assert man.window_size() > winsize_cmp_val - assert man.mapped_memory_size() == 0 - assert man.max_mapped_memory_size() > 0 - - # collection doesn't raise in 'any' mode - man._collect_lru_region(0) - # doesn't raise if we are within the limit - man._collect_lru_region(10) - - # doesn't fail if we over-allocate - assert man._collect_lru_region(sys.maxsize) == 0 - - # use a region, verify most basic functionality - with FileCreator(self.k_window_test_size, "manager_test") as fc: - fd = os.open(fc.path, os.O_RDONLY) - try: - for item in (fc.path, fd): - c = man.make_cursor(item) - assert c.path_or_fd() is item - assert c.use_region(10, 10).is_valid() - assert c.ofs_begin() == 10 - assert c.size() == 10 - with open(fc.path, 'rb') as fp: - assert c.buffer()[:] == fp.read(20)[10:] - - if isinstance(item, int): - self.assertRaises(ValueError, c.path) - else: - self.assertRaises(ValueError, c.fd) - # 
END handle value error - # END for each input - finally: - os.close(fd) - # END for each manasger type + with man: + assert man.num_file_handles() == 0 + assert man.num_open_files() == 0 + winsize_cmp_val = 0 + if isinstance(man, StaticWindowMapManager): + winsize_cmp_val = -1 + # END handle window size + assert man.window_size() > winsize_cmp_val + assert man.mapped_memory_size() == 0 + assert man.max_mapped_memory_size() > 0 + + # collection doesn't raise in 'any' mode + man._collect_lru_region(0) + # doesn't raise if we are within the limit + man._collect_lru_region(10) + + # doesn't fail if we over-allocate + assert man._collect_lru_region(sys.maxsize) == 0 + + # use a region, verify most basic functionality + with FileCreator(self.k_window_test_size, "manager_test") as fc: + fd = os.open(fc.path, os.O_RDONLY) + try: + for item in (fc.path, fd): + c = man.make_cursor(item) + assert c.path_or_fd() is item + assert c.use_region(10, 10).is_valid() + assert c.ofs_begin() == 10 + assert c.size() == 10 + with open(fc.path, 'rb') as fp: + assert c.buffer()[:] == fp.read(20)[10:] + + if isinstance(item, int): + self.assertRaises(ValueError, c.path) + else: + self.assertRaises(ValueError, c.fd) + # END handle value error + # END for each input + finally: + os.close(fd) + # END for each manager type def test_memman_operation(self): # test more access, force it to actually unmap regions @@ -110,117 +111,119 @@ def test_memman_operation(self): assert len(data) == fc.size # small windows, a reasonable max memory. Not too many regions at once - man = mtype(window_size=args[0], max_memory_size=args[1], max_open_handles=args[2]) - c = man.make_cursor(item) - - # still empty (more about that is tested in test_memory_manager() - assert man.num_open_files() == 0 - assert man.mapped_memory_size() == 0 - - base_offset = 5000 - # window size is 0 for static managers, hence size will be 0. 
We take that into consideration - size = man.window_size() // 2 - assert c.use_region(base_offset, size).is_valid() - rr = c.region() - assert rr.client_count() == 2 # the manager and the cursor and us - - assert man.num_open_files() == 1 - assert man.num_file_handles() == 1 - assert man.mapped_memory_size() == rr.size() - - # assert c.size() == size # the cursor may overallocate in its static version - assert c.ofs_begin() == base_offset - assert rr.ofs_begin() == 0 # it was aligned and expanded - if man.window_size(): - # but isn't larger than the max window (aligned) - assert rr.size() == align_to_mmap(man.window_size(), True) - else: - assert rr.size() == fc.size - # END ignore static managers which dont use windows and are aligned to file boundaries - - assert c.buffer()[:] == data[base_offset:base_offset + (size or c.size())] - - # obtain second window, which spans the first part of the file - it is a still the same window - nsize = (size or fc.size) - 10 - assert c.use_region(0, nsize).is_valid() - assert c.region() == rr - assert man.num_file_handles() == 1 - assert c.size() == nsize - assert c.ofs_begin() == 0 - assert c.buffer()[:] == data[:nsize] - - # map some part at the end, our requested size cannot be kept - overshoot = 4000 - base_offset = fc.size - (size or c.size()) + overshoot - assert c.use_region(base_offset, size).is_valid() - if man.window_size(): - assert man.num_file_handles() == 2 - assert c.size() < size - assert c.region() is not rr # old region is still available, but has not curser ref anymore - assert rr.client_count() == 1 # only held by manager - else: - assert c.size() < fc.size - # END ignore static managers which only have one handle per file - rr = c.region() - assert rr.client_count() == 2 # manager + cursor - assert rr.ofs_begin() < c.ofs_begin() # it should have extended itself to the left - assert rr.ofs_end() <= fc.size # it cannot be larger than the file - assert c.buffer()[:] == data[base_offset:base_offset + (size or 
c.size())] - - # unising a region makes the cursor invalid - c.unuse_region() - assert not c.is_valid() - if man.window_size(): - # but doesn't change anything regarding the handle count - we cache it and only - # remove mapped regions if we have to - assert man.num_file_handles() == 2 - # END ignore this for static managers - - # iterate through the windows, verify data contents - # this will trigger map collection after a while - max_random_accesses = 5000 - num_random_accesses = max_random_accesses - memory_read = 0 - st = time() - - # cache everything to get some more performance - includes_ofs = c.includes_ofs - max_mapped_memory_size = man.max_mapped_memory_size() - max_file_handles = man.max_file_handles() - mapped_memory_size = man.mapped_memory_size - num_file_handles = man.num_file_handles - while num_random_accesses: - num_random_accesses -= 1 - base_offset = randint(0, fc.size - 1) - - # precondition + with mtype(window_size=args[0], max_memory_size=args[1], max_open_handles=args[2]) as man: + c = man.make_cursor(item) + + # still empty (more about that is tested in test_memory_manager() + assert man.num_open_files() == 0 + assert man.mapped_memory_size() == 0 + + base_offset = 5000 + # window size is 0 for static managers, hence size will be 0. 
We take that into consideration + size = man.window_size() // 2 + assert c.use_region(base_offset, size).is_valid() + rr = c.region() + assert rr.client_count() == 2 # the manager and the cursor and us + + assert man.num_open_files() == 1 + assert man.num_file_handles() == 1 + assert man.mapped_memory_size() == rr.size() + + # assert c.size() == size # the cursor may overallocate in its static version + assert c.ofs_begin() == base_offset + assert rr.ofs_begin() == 0 # it was aligned and expanded + if man.window_size(): + # but isn't larger than the max window (aligned) + assert rr.size() == align_to_mmap(man.window_size(), True) + else: + assert rr.size() == fc.size + # END ignore static managers which dont use windows and are aligned to file boundaries + + assert c.buffer()[:] == data[base_offset:base_offset + (size or c.size())] + + # obtain second window, which spans the first part of the file - it is a still the same window + nsize = (size or fc.size) - 10 + assert c.use_region(0, nsize).is_valid() + assert c.region() == rr + assert man.num_file_handles() == 1 + assert c.size() == nsize + assert c.ofs_begin() == 0 + assert c.buffer()[:] == data[:nsize] + + # map some part at the end, our requested size cannot be kept + overshoot = 4000 + base_offset = fc.size - (size or c.size()) + overshoot + assert c.use_region(base_offset, size).is_valid() + if man.window_size(): + assert man.num_file_handles() == 2 + assert c.size() < size + assert c.region() is not rr # old region is still available, but has not curser ref anymore + assert rr.client_count() == 1 # only held by manager + else: + assert c.size() < fc.size + # END ignore static managers which only have one handle per file + rr = c.region() + assert rr.client_count() == 2 # manager + cursor + assert rr.ofs_begin() < c.ofs_begin() # it should have extended itself to the left + assert rr.ofs_end() <= fc.size # it cannot be larger than the file + assert c.buffer()[:] == data[base_offset:base_offset + (size or 
c.size())] + + # unising a region makes the cursor invalid + c.unuse_region() + assert not c.is_valid() if man.window_size(): - assert max_mapped_memory_size >= mapped_memory_size() - # END statics will overshoot, which is fine - assert max_file_handles >= num_file_handles() - assert c.use_region(base_offset, (size or c.size())).is_valid() - csize = c.size() - assert c.buffer()[:] == data[base_offset:base_offset + csize] - memory_read += csize - - assert includes_ofs(base_offset) - assert includes_ofs(base_offset + csize - 1) - assert not includes_ofs(base_offset + csize) - # END while we should do an access - elapsed = max(time() - st, 0.001) # prevent zero divison errors on windows - mb = float(1000 * 1000) - print("%s: Read %i mb of memory with %i random on cursor initialized with %s accesses in %fs (%f mb/s)\n" - % (mtype, memory_read / mb, max_random_accesses, type(item), elapsed, (memory_read / mb) / elapsed), - file=sys.stderr) - - # an offset as large as the size doesn't work ! - assert not c.use_region(fc.size, size).is_valid() - - # collection - it should be able to collect all - assert man.num_file_handles() - assert man.collect() - assert man.num_file_handles() == 0 - # END for each item - # END for each manager type + # but doesn't change anything regarding the handle count - we cache it and only + # remove mapped regions if we have to + assert man.num_file_handles() == 2 + # END ignore this for static managers + + # iterate through the windows, verify data contents + # this will trigger map collection after a while + max_random_accesses = 5000 + num_random_accesses = max_random_accesses + memory_read = 0 + st = time() + + # cache everything to get some more performance + includes_ofs = c.includes_ofs + max_mapped_memory_size = man.max_mapped_memory_size() + max_file_handles = man.max_file_handles() + mapped_memory_size = man.mapped_memory_size + num_file_handles = man.num_file_handles + while num_random_accesses: + num_random_accesses -= 1 + 
base_offset = randint(0, fc.size - 1) + + # precondition + if man.window_size(): + assert max_mapped_memory_size >= mapped_memory_size() + # END statics will overshoot, which is fine + assert max_file_handles >= num_file_handles() + assert c.use_region(base_offset, (size or c.size())).is_valid() + csize = c.size() + assert c.buffer()[:] == data[base_offset:base_offset + csize] + memory_read += csize + + assert includes_ofs(base_offset) + assert includes_ofs(base_offset + csize - 1) + assert not includes_ofs(base_offset + csize) + # END while we should do an access + elapsed = max(time() - st, 0.001) # prevent zero divison errors on windows + mb = float(1000 * 1000) + print("%s: Read %i mb of memory with %i random on cursor " + "initialized with %s accesses in %fs (%f mb/s)\n" + % (mtype, memory_read / mb, max_random_accesses, + type(item), elapsed, (memory_read / mb) / elapsed), + file=sys.stderr) + + # an offset as large as the size doesn't work ! + assert not c.use_region(fc.size, size).is_valid() + + # collection - it should be able to collect all + assert man.num_file_handles() + assert man.collect() + assert man.num_file_handles() == 0 + # END for each item + # END for each manager type finally: os.close(fd) diff --git a/smmap/test/test_tutorial.py b/smmap/test/test_tutorial.py index b03db9b..2f43ea1 100644 --- a/smmap/test/test_tutorial.py +++ b/smmap/test/test_tutorial.py @@ -22,60 +22,61 @@ def test_example(self): import smmap.test.lib with smmap.test.lib.FileCreator(1024 * 1024 * 8, "test_file") as fc: # obtain a cursor to access some file. - c = mman.make_cursor(fc.path) - - # the cursor is now associated with the file, but not yet usable - assert c.is_associated() - assert not c.is_valid() - - # before you can use the cursor, you have to specify a window you want to - # access. The following just says you want as much data as possible starting - # from offset 0. 
- # To be sure your region could be mapped, query for validity - assert c.use_region().is_valid() # use_region returns self - - # once a region was mapped, you must query its dimension regularly - # to assure you don't try to access its buffer out of its bounds - assert c.size() - c.buffer()[0] # first byte - c.buffer()[1:10] # first 9 bytes - c.buffer()[c.size() - 1] # last byte - - # its recommended not to create big slices when feeding the buffer - # into consumers (e.g. struct or zlib). - # Instead, either give the buffer directly, or use pythons buffer command. - from smmap.util import buffer - buffer(c.buffer(), 1, 9) # first 9 bytes without copying them - - # you can query absolute offsets, and check whether an offset is included - # in the cursor's data. - assert c.ofs_begin() < c.ofs_end() - assert c.includes_ofs(100) - - # If you are over out of bounds with one of your region requests, the - # cursor will be come invalid. It cannot be used in that state - assert not c.use_region(fc.size, 100).is_valid() - # map as much as possible after skipping the first 100 bytes - assert c.use_region(100).is_valid() - - # You can explicitly free cursor resources by unusing the cursor's region - c.unuse_region() - assert not c.is_valid() - - # Buffers - ######### - # Create a default buffer which can operate on the whole file - buf = smmap.SlidingWindowMapBuffer(mman.make_cursor(fc.path)) - - # you can use it right away - assert buf.cursor().is_valid() - - buf[0] # access the first byte - buf[-1] # access the last ten bytes on the file - buf[-10:] # access the last ten bytes - - # If you want to keep the instance between different accesses, use the - # dedicated methods - buf.end_access() - assert not buf.cursor().is_valid() # you cannot use the buffer anymore - assert buf.begin_access(offset=10) # start using the buffer at an offset + with mman: + c = mman.make_cursor(fc.path) + + # the cursor is now associated with the file, but not yet usable + assert 
c.is_associated() + assert not c.is_valid() + + # before you can use the cursor, you have to specify a window you want to + # access. The following just says you want as much data as possible starting + # from offset 0. + # To be sure your region could be mapped, query for validity + assert c.use_region().is_valid() # use_region returns self + + # once a region was mapped, you must query its dimension regularly + # to assure you don't try to access its buffer out of its bounds + assert c.size() + c.buffer()[0] # first byte + c.buffer()[1:10] # first 9 bytes + c.buffer()[c.size() - 1] # last byte + + # its recommended not to create big slices when feeding the buffer + # into consumers (e.g. struct or zlib). + # Instead, either give the buffer directly, or use pythons buffer command. + from smmap.util import buffer + buffer(c.buffer(), 1, 9) # first 9 bytes without copying them + + # you can query absolute offsets, and check whether an offset is included + # in the cursor's data. + assert c.ofs_begin() < c.ofs_end() + assert c.includes_ofs(100) + + # If you are over out of bounds with one of your region requests, the + # cursor will be come invalid. 
It cannot be used in that state + assert not c.use_region(fc.size, 100).is_valid() + # map as much as possible after skipping the first 100 bytes + assert c.use_region(100).is_valid() + + # You can explicitly free cursor resources by unusing the cursor's region + c.unuse_region() + assert not c.is_valid() + + # Buffers + ######### + # Create a default buffer which can operate on the whole file + buf = smmap.SlidingWindowMapBuffer(mman.make_cursor(fc.path)) + + # you can use it right away + assert buf.cursor().is_valid() + + buf[0] # access the first byte + buf[-1] # access the last ten bytes on the file + buf[-10:] # access the last ten bytes + + # If you want to keep the instance between different accesses, use the + # dedicated methods + buf.end_access() + assert not buf.cursor().is_valid() # you cannot use the buffer anymore + assert buf.begin_access(offset=10) # start using the buffer at an offset diff --git a/smmap/util.py b/smmap/util.py index 472633e..076f0ba 100644 --- a/smmap/util.py +++ b/smmap/util.py @@ -28,8 +28,11 @@ def buffer(obj, offset, size): # return obj[offset:offset + size] +PY3 = sys.version_info[0] >= 3 + + def string_types(): - if sys.version_info[0] >= 3: + if PY3: return str else: return basestring # @UndefinedVariable From 01df7f33136644b189e8b6a981180cbc4c77f490 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Tue, 25 Oct 2016 18:48:31 +0200 Subject: [PATCH 10/19] chore(ver): bump 2.1.0.dev1-->2.1.0.dev3 --- smmap/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smmap/__init__.py b/smmap/__init__.py index 065a82a..d1ba606 100644 --- a/smmap/__init__.py +++ b/smmap/__init__.py @@ -3,7 +3,7 @@ __author__ = "Sebastian Thiel" __contact__ = "byronimo@gmail.com" __homepage__ = "https://github.com/Byron/smmap" -version_info = (2, 1, 0, 'dev1') +version_info = (2, 1, 0, 'dev3') __version__ = '.'.join(str(i) for i in version_info) # make everything available in root package for convenience From 
bf68f775d7a9e176fa518d1d10cca6d351582684 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Tue, 25 Oct 2016 20:24:08 +0200 Subject: [PATCH 11/19] feat(mman-contxt): opt-out not to scream if mman not entered --- smmap/mman.py | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/smmap/mman.py b/smmap/mman.py index 1e63667..a7004e1 100644 --- a/smmap/mman.py +++ b/smmap/mman.py @@ -20,20 +20,23 @@ #}END utilities -def managed_mmaps(): +def managed_mmaps(check_entered=True): """Makes a memory-map context-manager instance for the correct python-version. - :return: either :class:`SlidingWindowMapManager` or :class:`StaticWindowMapManager` (if PY2) + :param bool check_entered: + whether to scream if not used as context-manager (`with` block) + :return: + either :class:`SlidingWindowMapManager` or :class:`StaticWindowMapManager` (if PY2) - If you want to change the default parameters of these classes, use them directly. + If you want to change other default parameters of these classes, use them directly. - .. Tip:: - Use it in a ``with ...:`` block, to free cached (and unused) resources. + .. Tip:: + Use it in a ``with ...:`` block, to free cached (and unused) resources. """ mman = SlidingWindowMapManager if PY3 else StaticWindowMapManager - return mman() + return mman(check_entered=check_entered) class WindowCursor(object): @@ -135,9 +138,7 @@ def use_region(self, offset=0, size=0, flags=0): **Note:**: The size actually mapped may be smaller than the given size. If that is the case, either the file has reached its end, or the map was created between two existing regions""" - if self._manager._entered <= 0: - raise ValueError('Context-manager %s not entered for %s!' 
% - (self._manager, self)) + self._manager._check_if_entered() need_region = True man = self._manager @@ -289,6 +290,7 @@ class StaticWindowMapManager(object): '_memory_size', # currently allocated memory size '_handle_count', # amount of currently allocated file handles '_entered', # updated on enter/exit, when 0, `close()` + 'check_entered', # bool, whether to scream if not used as context-manager (`with` block) ] #{ Configuration @@ -300,7 +302,8 @@ class StaticWindowMapManager(object): _MB_in_bytes = 1024 * 1024 - def __init__(self, window_size=0, max_memory_size=0, max_open_handles=sys.maxsize): + def __init__(self, window_size=0, max_memory_size=0, max_open_handles=sys.maxsize, + check_entered=True): """initialize the manager with the given parameters. :param window_size: if -1, a default window size will be chosen depending on the operating system's architecture. It will internally be quantified to a multiple of the page size @@ -310,7 +313,9 @@ def __init__(self, window_size=0, max_memory_size=0, max_open_handles=sys.maxsiz It is a soft limit that is tried to be kept, but nothing bad happens if we have to over-allocate :param max_open_handles: if not maxint, limit the amount of open file handles to the given number. Otherwise the amount is only limited by the system itself. 
If a system or soft limit is hit, - the manager will free as many handles as possible""" + the manager will free as many handles as possible + :param bool check_entered: whether to scream if not used as context-manager (`with` block) + """ self._fdict = dict() self._window_size = window_size self._max_memory_size = max_memory_size @@ -318,6 +323,7 @@ def __init__(self, window_size=0, max_memory_size=0, max_open_handles=sys.maxsiz self._memory_size = 0 self._handle_count = 0 self._entered = 0 + self.check_entered = check_entered if window_size < 0: coeff = 64 @@ -435,6 +441,10 @@ def _obtain_region(self, a, offset, size, flags, is_recursive): assert r.includes_ofs(offset) return r + def _check_if_entered(self): + if self.check_entered and self._entered <= 0: + raise ValueError('Context-manager %s not entered!' % self) + #}END internal methods #{ Interface @@ -454,8 +464,7 @@ def make_cursor(self, path_or_fd): **Note:** Using file descriptors directly is faster once new windows are mapped as it prevents the file to be opened again just for the purpose of mapping it.""" - if self._entered <= 0: - raise ValueError('Context-manager %s not entered!' % self) + self._check_if_entered() regions = self._fdict.get(path_or_fd) if regions: @@ -545,9 +554,10 @@ class SlidingWindowMapManager(StaticWindowMapManager): __slots__ = () - def __init__(self, window_size=-1, max_memory_size=0, max_open_handles=sys.maxsize): + def __init__(self, window_size=-1, max_memory_size=0, max_open_handles=sys.maxsize, + check_entered=True): """Adjusts the default window size to -1""" - super(SlidingWindowMapManager, self).__init__(window_size, max_memory_size, max_open_handles) + super(SlidingWindowMapManager, self).__init__(window_size, max_memory_size, max_open_handles, check_entered) def _obtain_region(self, a, offset, size, flags, is_recursive): # bisect to find an existing region. 
The c++ implementation cannot From 4598966aba7bbf8873de5e29893117b95f601379 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Tue, 25 Oct 2016 20:27:16 +0200 Subject: [PATCH 12/19] fix(leaks): attempt to gc-collect before region-collect futile... --- smmap/mman.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/smmap/mman.py b/smmap/mman.py index a7004e1..2b4a6fd 100644 --- a/smmap/mman.py +++ b/smmap/mman.py @@ -12,6 +12,7 @@ string_types, buffer, ) +import gc __all__ = ['managed_mmaps', "StaticWindowMapManager", "SlidingWindowMapManager", "WindowCursor"] @@ -351,6 +352,9 @@ def __exit__(self, exc_type, exc_value, traceback): assert self._entered > 0, self._entered self._entered -= 1 if self._entered == 0: + # Try to close all file-handles + #(a *Windows* only issue, and probably not fixed) + gc.collect() leaft_overs = self.collect() if leaft_overs: log.warning("Cleaned up %s left-over mmap-regions.") From d0bd74ec3ee0b66ca50e096497234c7ec72f8854 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Tue, 25 Oct 2016 21:57:13 +0200 Subject: [PATCH 13/19] fix(mman): exit log-msg were missing left-overs arg, log as debug --- smmap/mman.py | 2 +- smmap/util.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/smmap/mman.py b/smmap/mman.py index 2b4a6fd..e4d0bbd 100644 --- a/smmap/mman.py +++ b/smmap/mman.py @@ -357,7 +357,7 @@ def __exit__(self, exc_type, exc_value, traceback): gc.collect() leaft_overs = self.collect() if leaft_overs: - log.warning("Cleaned up %s left-over mmap-regions.") + log.debug("Cleaned up %s left-over mmap-regions." 
% leaft_overs) def close(self): self.collect() diff --git a/smmap/util.py b/smmap/util.py index 076f0ba..4cb6658 100644 --- a/smmap/util.py +++ b/smmap/util.py @@ -144,8 +144,6 @@ def __init__(self, path_or_fd, ofs, size, flags=0): fd = path_or_fd else: fd = os.open(path_or_fd, os.O_RDONLY | getattr(os, 'O_BINARY', 0) | flags) - # END handle fd - try: kwargs = dict(access=ACCESS_READ, offset=ofs) corrected_size = size From e33235a24d1e3cd9a41079af2b3edd09c5b4a02a Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Tue, 25 Oct 2016 22:01:51 +0200 Subject: [PATCH 14/19] refact(buf): also use SlidingWindowMapBuffer as optional context-manager + doc(tutorial): update use-cases + doc(changes): new bullet. --- doc/source/changes.rst | 14 +-- doc/source/tutorial.rst | 103 ++++++++++----------- smmap/buf.py | 11 ++- smmap/test/test_buf.py | 178 ++++++++++++++++++------------------ smmap/test/test_tutorial.py | 27 +++--- 5 files changed, 168 insertions(+), 165 deletions(-) diff --git a/doc/source/changes.rst b/doc/source/changes.rst index 3600adc..37a9494 100644 --- a/doc/source/changes.rst +++ b/doc/source/changes.rst @@ -5,16 +5,18 @@ Changelog 2.1.0 ====== -* **BREAKING API:** etrofit ``git.util.mman`` as context-manager, +- **BREAKING API:** retrofit ``git.util.mman`` as context-manager, to release memory-mapped regions held. - - The *mmap-manager(s)* are re-entrant, but not thread-safe **context-manager(s)**, - to be used within a ``with ...:`` block, ensuring any left-overs cursors are cleaned up. - If not entered, :meth:`StaticWindowMapManager.make_cursor()` and/or + + The *mmap-manager(s)* are re-entrant, but not thread-safe **context-manager(s)**, + to be used within a ``with ...:`` block, ensuring any left-overs cursors are cleaned up. + If not entered, :meth:`StaticWindowMapManager.make_cursor()` and/or :meth:`WindowCursor.use_region()` will scream. Get them from ``smmap.managed_mmaps()``. 
+- Retrofit :class:`SlidingWindowMapBuffer` also as context-manager. + v0.9.0 ======== - Fixed issue with resources never being freed as mmaps were never closed. @@ -41,7 +43,7 @@ v0.8.1 - A single bugfix -v0.8.0 +v0.8.0 ======== - Initial Release diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index 710f9d7..346f813 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -7,34 +7,34 @@ This text briefly introduces you to the basic design decisions and accompanying Design ====== -Per application, there must be a *MemoryManager* to be used throughout the application. +Per application, there must be a *MemoryManager* to be used throughout the application. It can be configured to keep your resources within certain limits. To access mapped regions, you require a cursor. Cursors point to exactly one file and serve as handles into it. As long as it exists, the respective memory region will remain available. -For convenience, a buffer implementation is provided which handles cursors and resource allocation +For convenience, a buffer implementation is provided which handles cursors and resource allocation behind its simple buffer like interface. Memory Managers ================ -There are two types of memory managers, one uses *static* windows, the other one uses *sliding* windows. -A window is a region of a file mapped into memory. Although the names might be somewhat misleading, -as technically windows are always static, the *sliding* version will allocate relatively small windows +There are two types of memory managers, one uses *static* windows, the other one uses *sliding* windows. +A window is a region of a file mapped into memory. Although the names might be somewhat misleading, +as technically windows are always static, the *sliding* version will allocate relatively small windows whereas the *static* version will always map the whole file. 
-The *static* memory-manager does nothing more than keeping a client count on the respective memory maps -which always map the whole file, which allows to make some assumptions that can lead to simplified +The *static* memory-manager does nothing more than keeping a client count on the respective memory maps +which always map the whole file, which allows to make some assumptions that can lead to simplified data access and increased performance, but reduces the compatibility to 32 bit systems or giant files. -The *sliding* memory-manager therefore should be the default manager when preparing an application +The *sliding* memory-manager therefore should be the default manager when preparing an application for handling huge amounts of data on 32 bit and 64 bit platforms .. Note:: - The *mmap-manager(s)* are re-entrant, but not thread-safe **context-manager(s)**, - to be used within a ``with ...:`` block, ensuring any left-overs cursors are cleaned up. - If not entered, :meth:`StaticWindowMapManager.make_cursor()` and/or + The *mmap-manager(s)* are re-entrant, but not thread-safe **context-manager(s)**, + to be used within a ``with ...:`` block, ensuring any left-overs cursors are cleaned up. + If not entered, :meth:`StaticWindowMapManager.make_cursor()` and/or :meth:`WindowCursor.use_region()` will scream. @@ -44,7 +44,7 @@ Use the :math:`smmap.managed_mmaps()` to take care of all this:: # This instance should be globally available in your application # It is configured to be well suitable for 32-bit or 64 bit applications. with smmap.managed_mmaps() as mman: - + # the manager provides much useful information about its current state # like the amount of open file handles or the amount of mapped memory mman.num_file_handles() @@ -60,81 +60,82 @@ Cursors with smmap.managed_mmaps() as mman: fc = smmap.test.lib.FileCreator(1024*1024*8, "test_file") - + # obtain a cursor to access some file. 
c = mman.make_cursor(fc.path) - + # the cursor is now associated with the file, but not yet usable assert c.is_associated() assert not c.is_valid() - - # before you can use the cursor, you have to specify a window you want to + + # before you can use the cursor, you have to specify a window you want to # access. The following just says you want as much data as possible starting # from offset 0. # To be sure your region could be mapped, query for validity assert c.use_region().is_valid() # use_region returns self - + # once a region was mapped, you must query its dimension regularly # to assure you don't try to access its buffer out of its bounds assert c.size() c.buffer()[0] # first byte c.buffer()[1:10] # first 9 bytes c.buffer()[c.size()-1] # last byte - + # its recommended not to create big slices when feeding the buffer - # into consumers (e.g. struct or zlib). + # into consumers (e.g. struct or zlib). # Instead, either give the buffer directly, or use pythons buffer command. buffer(c.buffer(), 1, 9) # first 9 bytes without copying them - + # you can query absolute offsets, and check whether an offset is included # in the cursor's data. assert c.ofs_begin() < c.ofs_end() assert c.includes_ofs(100) - - # If you are over out of bounds with one of your region requests, the + + # If you are over out of bounds with one of your region requests, the # cursor will be come invalid. It cannot be used in that state assert not c.use_region(fc.size, 100).is_valid() # map as much as possible after skipping the first 100 bytes assert c.use_region(100).is_valid() - - # You can explicitly free cursor resources by unusing the cursor's region + + # You must explicitly free cursor resources by unusing the cursor's region c.unuse_region() assert not c.is_valid() - -Now you would have to write your algorithms around this interface to properly slide through huge amounts of data. 
- + +Now you would have to write your algorithms around this interface to properly slide through huge amounts of data. + Alternatively you can use a convenience interface. ======== Buffers ======== -To make first use easier, at the expense of performance, there is a Buffer implementation which uses a cursor underneath. +To make first use easier, at the expense of performance, there is a Buffer implementation +which uses a cursor underneath. -With it, you can access all data in a possibly huge file without having to take care of setting the cursor to different regions yourself:: +With it, you can access all data in a possibly huge file +without having to take care of setting the cursor to different regions yourself:: # Create a default buffer which can operate on the whole file - buf = smmap.SlidingWindowMapBuffer(mman.make_cursor(fc.path)) - - # you can use it right away - assert buf.cursor().is_valid() - - buf[0] # access the first byte - buf[-1] # access the last ten bytes on the file - buf[-10:]# access the last ten bytes - - # If you want to keep the instance between different accesses, use the - # dedicated methods - buf.end_access() - assert not buf.cursor().is_valid() # you cannot use the buffer anymore - assert buf.begin_access(offset=10) # start using the buffer at an offset - - # it will stop using resources automatically once it goes out of scope - - Disadvantages - -------------- - Buffers cannot be used in place of strings or maps, hence you have to slice them to have valid - input for the sorts of struct and zlib. - A slice means a lot of data handling overhead which makes buffers slower compared to using cursors directly. 
+ with smmap.SlidingWindowMapBuffer(mman.make_cursor(fc.path)) as buf: + + # you can use it right away + assert buf.cursor().is_valid() + + buf[0] # access the first byte + buf[-1] # access the last ten bytes on the file + buf[-10:]# access the last ten bytes + + # If you want to keep the instance between different accesses, use the + # dedicated methods + buf.end_access() + assert not buf.cursor().is_valid() # you cannot use the buffer anymore + assert buf.begin_access(offset=10) # start using the buffer at an offset + + +Disadvantages +-------------- +Buffers cannot be used in place of strings or maps, hence you have to slice them to have valid +input for the sorts of struct and zlib. +A slice means a lot of data handling overhead which makes buffers slower compared to using cursors directly. diff --git a/smmap/buf.py b/smmap/buf.py index cf73970..aed4e4f 100644 --- a/smmap/buf.py +++ b/smmap/buf.py @@ -18,9 +18,14 @@ class SlidingWindowMapBuffer(object): The buffer is relative, that is if you map an offset, index 0 will map to the first byte at the offset you used during initialization or begin_access - **Note:** Although this type effectively hides the fact that there are mapped windows - underneath, it can unfortunately not be used in any non-pure python method which - needs a buffer or string""" + .. Tip:: + Use it as a context-manager inside a ``with SlidingWindowMapBuffer(...):`` block. + + .. 
Note:: + Although this type effectively hides the fact that there are mapped windows + underneath, it can unfortunately not be used in any non-pure python method which + needs a buffer or string + """ __slots__ = ( '_c', # our cursor '_size', # our supposed size diff --git a/smmap/test/test_buf.py b/smmap/test/test_buf.py index a2c2e7d..282354c 100644 --- a/smmap/test/test_buf.py +++ b/smmap/test/test_buf.py @@ -28,49 +28,47 @@ def test_basics(self): # invalid paths fail upon construction with FileCreator(self.k_window_test_size, "buffer_test") as fc: with man_optimal: - c = man_optimal.make_cursor(fc.path) - self.assertRaises(ValueError, SlidingWindowMapBuffer, type(c)()) # invalid cursor - self.assertRaises(ValueError, SlidingWindowMapBuffer, c, fc.size) # offset too large - - buf = SlidingWindowMapBuffer() # can create uninitailized buffers - assert buf.cursor() is None - - # can call end access any time - buf.end_access() - buf.end_access() - assert len(buf) == 0 - - # begin access can revive it, if the offset is suitable - offset = 100 - assert buf.begin_access(c, fc.size) == False - assert buf.begin_access(c, offset) == True - assert len(buf) == fc.size - offset - assert buf.cursor().is_valid() - - # empty begin access keeps it valid on the same path, but alters the offset - assert buf.begin_access() == True - assert len(buf) == fc.size - assert buf.cursor().is_valid() - - # simple access - with open(fc.path, 'rb') as fp: - data = fp.read() - assert data[offset] == buf[0] - assert data[offset:offset * 2] == buf[0:offset] - - # negative indices, partial slices - assert buf[-1] == buf[len(buf) - 1] - assert buf[-10:] == buf[len(buf) - 10:len(buf)] - - # end access makes its cursor invalid - buf.end_access() - assert not buf.cursor().is_valid() - assert buf.cursor().is_associated() # but it remains associated - - # an empty begin access fixes it up again - assert buf.begin_access() == True and buf.cursor().is_valid() - del(buf) # ends access automatically - 
del(c) + with man_optimal.make_cursor(fc.path) as c: + self.assertRaises(ValueError, SlidingWindowMapBuffer, type(c)()) # invalid cursor + self.assertRaises(ValueError, SlidingWindowMapBuffer, c, fc.size) # offset too large + + with SlidingWindowMapBuffer() as buf: # can create uninitailized buffers + assert buf.cursor() is None + + # can call end access any time + buf.end_access() + buf.end_access() + assert len(buf) == 0 + + # begin access can revive it, if the offset is suitable + offset = 100 + assert buf.begin_access(c, fc.size) == False + assert buf.begin_access(c, offset) == True + assert len(buf) == fc.size - offset + assert buf.cursor().is_valid() + + # empty begin access keeps it valid on the same path, but alters the offset + assert buf.begin_access() == True + assert len(buf) == fc.size + assert buf.cursor().is_valid() + + # simple access + with open(fc.path, 'rb') as fp: + data = fp.read() + assert data[offset] == buf[0] + assert data[offset:offset * 2] == buf[0:offset] + + # negative indices, partial slices + assert buf[-1] == buf[len(buf) - 1] + assert buf[-10:] == buf[len(buf) - 10:len(buf)] + + # end access makes its cursor invalid + buf.end_access() + assert not buf.cursor().is_valid() + assert buf.cursor().is_associated() # but it remains associated + + # an empty begin access fixes it up again + assert buf.begin_access() == True and buf.cursor().is_valid() assert man_optimal.num_file_handles() == 1 @@ -86,51 +84,49 @@ def test_performance(self): max_num_accesses = 100 fd = os.open(fc.path, os.O_RDONLY) - for item in (fc.path, fd): - for manager, man_id in ((man_optimal, 'optimal'), - (man_worst_case, 'worst case'), - (static_man, 'static optimal')): - with manager: - buf = SlidingWindowMapBuffer(manager.make_cursor(item)) - assert manager.num_file_handles() == 1 - for access_mode in range(2): # single, multi - num_accesses_left = max_num_accesses - num_bytes = 0 - fsize = fc.size - - st = time() - buf.begin_access() - while num_accesses_left: - 
num_accesses_left -= 1 - if access_mode: # multi - ofs_start = randint(0, fsize) - ofs_end = randint(ofs_start, fsize) - d = buf[ofs_start:ofs_end] - assert len(d) == ofs_end - ofs_start - assert d == data[ofs_start:ofs_end] - num_bytes += len(d) - del d - else: - pos = randint(0, fsize) - assert buf[pos] == data[pos] - num_bytes += 1 - # END handle mode - # END handle num accesses - - buf.end_access() - assert manager.num_file_handles() - assert manager.collect() - assert manager.num_file_handles() == 0 - elapsed = max(time() - st, 0.001) # prevent zero division errors on windows - mb = float(1000 * 1000) - mode_str = (access_mode and "slice") or "single byte" - print("%s: Made %i random %s accesses to buffer created from %s " - "reading a total of %f mb in %f s (%f mb/s)" - % (man_id, max_num_accesses, mode_str, type(item), - num_bytes / mb, elapsed, (num_bytes / mb) / elapsed), - file=sys.stderr) - # END handle access mode - del buf - # END for each manager - # END for each input - os.close(fd) + try: + for item in (fc.path, fd): + for manager, man_id in ((man_optimal, 'optimal'), + (man_worst_case, 'worst case'), + (static_man, 'static optimal')): + with manager: + with SlidingWindowMapBuffer(manager.make_cursor(item)) as buf: + assert manager.num_file_handles() == 1 + for access_mode in range(2): # single, multi + num_accesses_left = max_num_accesses + num_bytes = 0 + fsize = fc.size + + st = time() + buf.begin_access() + while num_accesses_left: + num_accesses_left -= 1 + if access_mode: # multi + ofs_start = randint(0, fsize) + ofs_end = randint(ofs_start, fsize) + d = buf[ofs_start:ofs_end] + assert len(d) == ofs_end - ofs_start + assert d == data[ofs_start:ofs_end] + num_bytes += len(d) + del d + else: + pos = randint(0, fsize) + assert buf[pos] == data[pos] + num_bytes += 1 + # END handle mode + # END handle num accesses + + buf.end_access() + assert manager.num_file_handles() + assert manager.collect() + assert manager.num_file_handles() == 0 + elapsed = 
max(time() - st, 0.001) # prevent zero division errors on windows + mb = float(1000 * 1000) + mode_str = (access_mode and "slice") or "single byte" + print("%s: Made %i random %s accesses to buffer created from %s " + "reading a total of %f mb in %f s (%f mb/s)" + % (man_id, max_num_accesses, mode_str, type(item), + num_bytes / mb, elapsed, (num_bytes / mb) / elapsed), + file=sys.stderr) + finally: + os.close(fd) diff --git a/smmap/test/test_tutorial.py b/smmap/test/test_tutorial.py index 2f43ea1..cfbc29f 100644 --- a/smmap/test/test_tutorial.py +++ b/smmap/test/test_tutorial.py @@ -66,17 +66,16 @@ def test_example(self): # Buffers ######### # Create a default buffer which can operate on the whole file - buf = smmap.SlidingWindowMapBuffer(mman.make_cursor(fc.path)) - - # you can use it right away - assert buf.cursor().is_valid() - - buf[0] # access the first byte - buf[-1] # access the last ten bytes on the file - buf[-10:] # access the last ten bytes - - # If you want to keep the instance between different accesses, use the - # dedicated methods - buf.end_access() - assert not buf.cursor().is_valid() # you cannot use the buffer anymore - assert buf.begin_access(offset=10) # start using the buffer at an offset + with smmap.SlidingWindowMapBuffer(mman.make_cursor(fc.path)) as buf: + # you can use it right away + assert buf.cursor().is_valid() + + buf[0] # access the first byte + buf[-1] # access the last ten bytes on the file + buf[-10:] # access the last ten bytes + + # If you want to keep the instance between different accesses, use the + # dedicated methods + buf.end_access() + assert not buf.cursor().is_valid() # you cannot use the buffer anymore + assert buf.begin_access(offset=10) # start using the buffer at an offset From 33f12e64438ff942ec401ef23697e6c31ee03a4a Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Tue, 25 Oct 2016 22:45:28 +0200 Subject: [PATCH 15/19] refact(TCs): unittestize assertions --- smmap/test/test_buf.py | 34 
++++++++++---------- smmap/test/test_mman.py | 70 ++++++++++++++++++++--------------------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/smmap/test/test_buf.py b/smmap/test/test_buf.py index 282354c..fe2ce82 100644 --- a/smmap/test/test_buf.py +++ b/smmap/test/test_buf.py @@ -38,29 +38,29 @@ def test_basics(self): # can call end access any time buf.end_access() buf.end_access() - assert len(buf) == 0 + self.assertEqual(len(buf), 0) # begin access can revive it, if the offset is suitable offset = 100 - assert buf.begin_access(c, fc.size) == False - assert buf.begin_access(c, offset) == True - assert len(buf) == fc.size - offset + self.assertEqual(buf.begin_access(c, fc.size), False) + self.assertEqual(buf.begin_access(c, offset), True) + self.assertEqual(len(buf), fc.size - offset) assert buf.cursor().is_valid() # empty begin access keeps it valid on the same path, but alters the offset - assert buf.begin_access() == True - assert len(buf) == fc.size + self.assertEqual(buf.begin_access(), True) + self.assertEqual(len(buf), fc.size) assert buf.cursor().is_valid() # simple access with open(fc.path, 'rb') as fp: data = fp.read() - assert data[offset] == buf[0] - assert data[offset:offset * 2] == buf[0:offset] + self.assertEqual(data[offset], buf[0]) + self.assertEqual(data[offset:offset * 2], buf[0:offset]) # negative indices, partial slices - assert buf[-1] == buf[len(buf) - 1] - assert buf[-10:] == buf[len(buf) - 10:len(buf)] + self.assertEqual(buf[-1], buf[len(buf) - 1]) + self.assertEqual(buf[-10:], buf[len(buf) - 10:len(buf)]) # end access makes its cursor invalid buf.end_access() @@ -68,9 +68,9 @@ def test_basics(self): assert buf.cursor().is_associated() # but it remains associated # an empty begin access fixes it up again - assert buf.begin_access() == True and buf.cursor().is_valid() + self.assertEqual(buf.begin_access(), True and buf.cursor().is_valid()) - assert man_optimal.num_file_handles() == 1 + 
self.assertEqual(man_optimal.num_file_handles(), 1) def test_performance(self): # PERFORMANCE @@ -91,7 +91,7 @@ def test_performance(self): (static_man, 'static optimal')): with manager: with SlidingWindowMapBuffer(manager.make_cursor(item)) as buf: - assert manager.num_file_handles() == 1 + self.assertEqual(manager.num_file_handles(), 1) for access_mode in range(2): # single, multi num_accesses_left = max_num_accesses num_bytes = 0 @@ -105,13 +105,13 @@ def test_performance(self): ofs_start = randint(0, fsize) ofs_end = randint(ofs_start, fsize) d = buf[ofs_start:ofs_end] - assert len(d) == ofs_end - ofs_start - assert d == data[ofs_start:ofs_end] + self.assertEqual(len(d), ofs_end - ofs_start) + self.assertEqual(d, data[ofs_start:ofs_end]) num_bytes += len(d) del d else: pos = randint(0, fsize) - assert buf[pos] == data[pos] + self.assertEqual(buf[pos], data[pos]) num_bytes += 1 # END handle mode # END handle num accesses @@ -119,7 +119,7 @@ def test_performance(self): buf.end_access() assert manager.num_file_handles() assert manager.collect() - assert manager.num_file_handles() == 0 + self.assertEqual(manager.num_file_handles(), 0) elapsed = max(time() - st, 0.001) # prevent zero division errors on windows mb = float(1000 * 1000) mode_str = (access_mode and "slice") or "single byte" diff --git a/smmap/test/test_mman.py b/smmap/test/test_mman.py index 263d8fb..469ab0e 100644 --- a/smmap/test/test_mman.py +++ b/smmap/test/test_mman.py @@ -24,13 +24,13 @@ def test_cursor(self): ci = WindowCursor(man) # invalid cursor assert not ci.is_valid() assert not ci.is_associated() - assert ci.size() == 0 # this is cached, so we can query it in invalid state + self.assertEqual(ci.size(), 0) # this is cached, so we can query it in invalid state cv = man.make_cursor(fc.path) assert not cv.is_valid() # no region mapped yet assert cv.is_associated() # but it know where to map it from - assert cv.file_size() == fc.size - assert cv.path() == fc.path + 
self.assertEqual(cv.file_size(), fc.size) + self.assertEqual(cv.path(), fc.path) # copy module cio = copy(cv) @@ -55,14 +55,14 @@ def test_memory_manager(self): for man in (static_man, slide_man): with man: - assert man.num_file_handles() == 0 - assert man.num_open_files() == 0 + self.assertEqual(man.num_file_handles(), 0) + self.assertEqual(man.num_open_files(), 0) winsize_cmp_val = 0 if isinstance(man, StaticWindowMapManager): winsize_cmp_val = -1 # END handle window size assert man.window_size() > winsize_cmp_val - assert man.mapped_memory_size() == 0 + self.assertEqual(man.mapped_memory_size(), 0) assert man.max_mapped_memory_size() > 0 # collection doesn't raise in 'any' mode @@ -71,7 +71,7 @@ def test_memory_manager(self): man._collect_lru_region(10) # doesn't fail if we over-allocate - assert man._collect_lru_region(sys.maxsize) == 0 + self.assertEqual(man._collect_lru_region(sys.maxsize), 0) # use a region, verify most basic functionality with FileCreator(self.k_window_test_size, "manager_test") as fc: @@ -81,10 +81,10 @@ def test_memory_manager(self): c = man.make_cursor(item) assert c.path_or_fd() is item assert c.use_region(10, 10).is_valid() - assert c.ofs_begin() == 10 - assert c.size() == 10 + self.assertEqual(c.ofs_begin(), 10) + self.assertEqual(c.size(), 10) with open(fc.path, 'rb') as fp: - assert c.buffer()[:] == fp.read(20)[10:] + self.assertEqual(c.buffer()[:], fp.read(20)[10:]) if isinstance(item, int): self.assertRaises(ValueError, c.path) @@ -108,65 +108,65 @@ def test_memman_operation(self): for mtype, args in ((StaticWindowMapManager, (0, fc.size // 3, max_num_handles)), (SlidingWindowMapManager, (fc.size // 100, fc.size // 3, max_num_handles)),): for item in (fc.path, fd): - assert len(data) == fc.size + self.assertEqual(len(data), fc.size) # small windows, a reasonable max memory. 
Not too many regions at once with mtype(window_size=args[0], max_memory_size=args[1], max_open_handles=args[2]) as man: c = man.make_cursor(item) # still empty (more about that is tested in test_memory_manager() - assert man.num_open_files() == 0 - assert man.mapped_memory_size() == 0 + self.assertEqual(man.num_open_files(), 0) + self.assertEqual(man.mapped_memory_size(), 0) base_offset = 5000 # window size is 0 for static managers, hence size will be 0. We take that into consideration size = man.window_size() // 2 assert c.use_region(base_offset, size).is_valid() rr = c.region() - assert rr.client_count() == 2 # the manager and the cursor and us + self.assertEqual(rr.client_count(), 2) # the manager and the cursor and us - assert man.num_open_files() == 1 - assert man.num_file_handles() == 1 - assert man.mapped_memory_size() == rr.size() + self.assertEqual(man.num_open_files(), 1) + self.assertEqual(man.num_file_handles(), 1) + self.assertEqual(man.mapped_memory_size(), rr.size()) - # assert c.size() == size # the cursor may overallocate in its static version - assert c.ofs_begin() == base_offset - assert rr.ofs_begin() == 0 # it was aligned and expanded + # self.assertEqual(c.size(), size # the cursor may overallocate in its static version) + self.assertEqual(c.ofs_begin(), base_offset) + self.assertEqual(rr.ofs_begin(), 0) # it was aligned and expanded if man.window_size(): # but isn't larger than the max window (aligned) - assert rr.size() == align_to_mmap(man.window_size(), True) + self.assertEqual(rr.size(), align_to_mmap(man.window_size(), True)) else: - assert rr.size() == fc.size + self.assertEqual(rr.size(), fc.size) # END ignore static managers which dont use windows and are aligned to file boundaries - assert c.buffer()[:] == data[base_offset:base_offset + (size or c.size())] + self.assertEqual(c.buffer()[:], data[base_offset:base_offset + (size or c.size())]) # obtain second window, which spans the first part of the file - it is a still the same window 
nsize = (size or fc.size) - 10 assert c.use_region(0, nsize).is_valid() - assert c.region() == rr - assert man.num_file_handles() == 1 - assert c.size() == nsize - assert c.ofs_begin() == 0 - assert c.buffer()[:] == data[:nsize] + self.assertEqual(c.region(), rr) + self.assertEqual(man.num_file_handles(), 1) + self.assertEqual(c.size(), nsize) + self.assertEqual(c.ofs_begin(), 0) + self.assertEqual(c.buffer()[:], data[:nsize]) # map some part at the end, our requested size cannot be kept overshoot = 4000 base_offset = fc.size - (size or c.size()) + overshoot assert c.use_region(base_offset, size).is_valid() if man.window_size(): - assert man.num_file_handles() == 2 + self.assertEqual(man.num_file_handles(), 2) assert c.size() < size assert c.region() is not rr # old region is still available, but has not curser ref anymore - assert rr.client_count() == 1 # only held by manager + self.assertEqual(rr.client_count(), 1) # only held by manager else: assert c.size() < fc.size # END ignore static managers which only have one handle per file rr = c.region() - assert rr.client_count() == 2 # manager + cursor + self.assertEqual(rr.client_count(), 2) # manager + cursor assert rr.ofs_begin() < c.ofs_begin() # it should have extended itself to the left assert rr.ofs_end() <= fc.size # it cannot be larger than the file - assert c.buffer()[:] == data[base_offset:base_offset + (size or c.size())] + self.assertEqual(c.buffer()[:], data[base_offset:base_offset + (size or c.size())]) # unising a region makes the cursor invalid c.unuse_region() @@ -174,7 +174,7 @@ def test_memman_operation(self): if man.window_size(): # but doesn't change anything regarding the handle count - we cache it and only # remove mapped regions if we have to - assert man.num_file_handles() == 2 + self.assertEqual(man.num_file_handles(), 2) # END ignore this for static managers # iterate through the windows, verify data contents @@ -201,7 +201,7 @@ def test_memman_operation(self): assert max_file_handles >= 
num_file_handles() assert c.use_region(base_offset, (size or c.size())).is_valid() csize = c.size() - assert c.buffer()[:] == data[base_offset:base_offset + csize] + self.assertEqual(c.buffer()[:], data[base_offset:base_offset + csize]) memory_read += csize assert includes_ofs(base_offset) @@ -222,7 +222,7 @@ def test_memman_operation(self): # collection - it should be able to collect all assert man.num_file_handles() assert man.collect() - assert man.num_file_handles() == 0 + self.assertEqual(man.num_file_handles(), 0) # END for each item # END for each manager type finally: From 8489c31d92c63a42273ab04c1d0690f495617ee8 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Tue, 25 Oct 2016 23:42:56 +0200 Subject: [PATCH 16/19] refact(buf): simplify API - no begin/end after construct + feat(mman): report missed exits. --- doc/source/changes.rst | 4 +- doc/source/tutorial.rst | 109 ++++++++++++++++++------------------ smmap/buf.py | 88 +++++++++++++++-------------- smmap/mman.py | 5 ++ smmap/test/test_buf.py | 69 ++++++++++------------- smmap/test/test_tutorial.py | 7 +-- 6 files changed, 140 insertions(+), 142 deletions(-) diff --git a/doc/source/changes.rst b/doc/source/changes.rst index 37a9494..e6b5d7a 100644 --- a/doc/source/changes.rst +++ b/doc/source/changes.rst @@ -15,7 +15,9 @@ Changelog Get them from ``smmap.managed_mmaps()``. -- Retrofit :class:`SlidingWindowMapBuffer` also as context-manager. +- Simplify :class:`SlidingWindowMapBuffer` as create/close context-manager + (no ``begin_access()``, or ``end_access()``). + v0.9.0 ======== diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index 346f813..8ba8d7c 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -8,13 +8,14 @@ This text briefly introduces you to the basic design decisions and accompanying Design ====== Per application, there must be a *MemoryManager* to be used throughout the application. -It can be configured to keep your resources within certain limits. 
+It can be configured to keep your resources within certain limits (see :func:`smmap.managed_mmaps()`). -To access mapped regions, you require a cursor. Cursors point to exactly one file and serve as handles into it. +To access mapped regions, you require a cursor. Cursors point to exactly one file +and serve as handles into it. As long as it exists, the respective memory region will remain available. -For convenience, a buffer implementation is provided which handles cursors and resource allocation -behind its simple buffer like interface. +For convenience, a buffer implementation is provided (:class:`smmap.SlidingWindowMapBuffer`) +which handles cursors and resource allocation behind its simple buffer like interface. Memory Managers @@ -56,50 +57,48 @@ Cursors ======== *Cursors* are handles that point onto a window, i.e. a region of a file mapped into memory. From them you may obtain a buffer through which the data of that window can actually be accessed:: - import smmap.test.lib - - with smmap.managed_mmaps() as mman: - fc = smmap.test.lib.FileCreator(1024*1024*8, "test_file") + import smmap.test.lib as tlib + with smmap.managed_mmaps() as mman, tlib.FileCreator(1024*1024*8, "test_file") as fc: # obtain a cursor to access some file. - c = mman.make_cursor(fc.path) - - # the cursor is now associated with the file, but not yet usable - assert c.is_associated() - assert not c.is_valid() - - # before you can use the cursor, you have to specify a window you want to - # access. The following just says you want as much data as possible starting - # from offset 0. 
- # To be sure your region could be mapped, query for validity - assert c.use_region().is_valid() # use_region returns self - - # once a region was mapped, you must query its dimension regularly - # to assure you don't try to access its buffer out of its bounds - assert c.size() - c.buffer()[0] # first byte - c.buffer()[1:10] # first 9 bytes - c.buffer()[c.size()-1] # last byte - - # its recommended not to create big slices when feeding the buffer - # into consumers (e.g. struct or zlib). - # Instead, either give the buffer directly, or use pythons buffer command. - buffer(c.buffer(), 1, 9) # first 9 bytes without copying them - - # you can query absolute offsets, and check whether an offset is included - # in the cursor's data. - assert c.ofs_begin() < c.ofs_end() - assert c.includes_ofs(100) - - # If you are over out of bounds with one of your region requests, the - # cursor will be come invalid. It cannot be used in that state - assert not c.use_region(fc.size, 100).is_valid() - # map as much as possible after skipping the first 100 bytes - assert c.use_region(100).is_valid() - - # You must explicitly free cursor resources by unusing the cursor's region - c.unuse_region() - assert not c.is_valid() + with mman.make_cursor(fc.path) as c: + + # the cursor is now associated with the file, but not yet usable + assert c.is_associated() + assert not c.is_valid() + + # before you can use the cursor, you have to specify a window you want to + # access. The following just says you want as much data as possible starting + # from offset 0. 
+ # To be sure your region could be mapped, query for validity + assert c.use_region().is_valid() # use_region returns self + + # once a region was mapped, you must query its dimension regularly + # to assure you don't try to access its buffer out of its bounds + assert c.size() + c.buffer()[0] # first byte + c.buffer()[1:10] # first 9 bytes + c.buffer()[c.size()-1] # last byte + + # its recommended not to create big slices when feeding the buffer + # into consumers (e.g. struct or zlib). + # Instead, either give the buffer directly, or use pythons buffer command. + buffer(c.buffer(), 1, 9) # first 9 bytes without copying them + + # you can query absolute offsets, and check whether an offset is included + # in the cursor's data. + assert c.ofs_begin() < c.ofs_end() + assert c.includes_ofs(100) + + # If you are over out of bounds with one of your region requests, the + # cursor will be come invalid. It cannot be used in that state + assert not c.use_region(fc.size, 100).is_valid() + # map as much as possible after skipping the first 100 bytes + assert c.use_region(100).is_valid() + + # You must explicitly free cursor resources by unusing the cursor's region + c.unuse_region() + assert not c.is_valid() Now you would have to write your algorithms around this interface to properly slide through huge amounts of data. @@ -116,9 +115,9 @@ which uses a cursor underneath. 
With it, you can access all data in a possibly huge file without having to take care of setting the cursor to different regions yourself:: - # Create a default buffer which can operate on the whole file - with smmap.SlidingWindowMapBuffer(mman.make_cursor(fc.path)) as buf: - + ## Create a default buffer which can operate on the whole file + cur = mman.make_cursor(fc.path) + with smmap.SlidingWindowMapBuffer(cur) as buf: # you can use it right away assert buf.cursor().is_valid() @@ -126,11 +125,13 @@ without having to take care of setting the cursor to different regions yourself: buf[-1] # access the last ten bytes on the file buf[-10:]# access the last ten bytes - # If you want to keep the instance between different accesses, use the - # dedicated methods - buf.end_access() - assert not buf.cursor().is_valid() # you cannot use the buffer anymore - assert buf.begin_access(offset=10) # start using the buffer at an offset + ## You cannot use the buffer anymore. + assert not buf.cursor().is_valid() + + ## If you want to keep the instance between different accesses, + # use another instance. + with smmap.SlidingWindowMapBuffer(cur, offset=10) as buf: + assert buf.cursor().is_valid() Disadvantages diff --git a/smmap/buf.py b/smmap/buf.py index aed4e4f..7e48ced 100644 --- a/smmap/buf.py +++ b/smmap/buf.py @@ -1,5 +1,6 @@ """Module with a simple buffer implementation using the memory manager""" import sys +import logging __all__ = ["SlidingWindowMapBuffer"] @@ -10,6 +11,9 @@ bytes = str # @ReservedAssignment +log = logging.getLogger(__name__) + + class SlidingWindowMapBuffer(object): """A buffer like object which allows direct byte-wise object and slicing into @@ -29,11 +33,12 @@ class SlidingWindowMapBuffer(object): __slots__ = ( '_c', # our cursor '_size', # our supposed size + '_entered', # entry/exit accounting ) def __init__(self, cursor=None, offset=0, size=sys.maxsize, flags=0): - """Initalize the instance to operate on the given cursor. 
- :param cursor: if not None, the associated cursor to the file you want to access + """Initialize the instance to operate on the given cursor. + :param cursor: The associated cursor to the file you want to access If None, you have call begin_access before using the buffer and provide a cursor :param offset: absolute offset in bytes :param size: the total size of the mapping. Defaults to the maximum possible size @@ -43,24 +48,49 @@ def __init__(self, cursor=None, offset=0, size=sys.maxsize, flags=0): Hence it is in your own interest to provide a proper size ! :param flags: Additional flags to be passed to os.open :raise ValueError: if the buffer could not achieve a valid state""" + if not cursor: + raise ValueError("Cursor cannot be null!") self._c = cursor - if cursor and not self.begin_access(cursor, offset, size, flags): - raise ValueError("Failed to allocate the buffer - probably the given offset is out of bounds") - # END handle offset - - def __del__(self): - self.end_access() + self._entered = 0 + + if cursor.is_associated() and cursor.use_region(offset, size, flags).is_valid(): + # if given size is too large or default, we computer a proper size + # If its smaller, we assume the combination between offset and size + # as chosen by the user is correct and use it ! + # If not, the user is in trouble. + if size > cursor.file_size(): + size = cursor.file_size() - offset + # END handle size + self._size = size + else: + raise ValueError("Cursor %s not associated or mapping region failed!" % cursor) def __enter__(self): + assert self._entered >= 0, self._entered + self._entered += 1 return self def __exit__(self, exc_type, exc_value, traceback): - self.end_access() + assert self._entered >= 0, self._entered + self._entered -= 1 + if self._entered == 0: + self.close() + + def __del__(self): + if self._entered != 0: + log.warning("Missed %s exit(s) on %s!" 
% (self._entered, self)) + self.close() + + def _check_if_entered(self): + if self._entered <= 0: + raise ValueError('Context-manager %s not entered!' % self) def __len__(self): return self._size def __getitem__(self, i): + self._check_if_entered() + if isinstance(i, slice): return self.__getslice__(i.start or 0, i.stop or self._size) c = self._c @@ -73,6 +103,8 @@ def __getitem__(self, i): return c.buffer()[i - c.ofs_begin()] def __getslice__(self, i, j): + self._check_if_entered() + c = self._c # fast path, slice fully included - safes a concatenate operation and # should be the default @@ -124,44 +156,16 @@ def __getslice__(self, i, j): # END fast or slow path #{ Interface - def begin_access(self, cursor=None, offset=0, size=sys.maxsize, flags=0): - """Call this before the first use of this instance. The method was already - called by the constructor in case sufficient information was provided. - - For more information no the parameters, see the __init__ method - :param path: if cursor is None the existing one will be used. - :return: True if the buffer can be used""" - if cursor: - self._c = cursor - # END update our cursor - - # reuse existing cursors if possible - if self._c is not None and self._c.is_associated(): - res = self._c.use_region(offset, size, flags).is_valid() - if res: - # if given size is too large or default, we computer a proper size - # If its smaller, we assume the combination between offset and size - # as chosen by the user is correct and use it ! - # If not, the user is in trouble. - if size > self._c.file_size(): - size = self._c.file_size() - offset - # END handle size - self._size = size - # END set size - return res - # END use our cursor - return False - - def end_access(self): + def close(self): """Call this method once you are done using the instance. It is automatically called on destruction, and should be called just in time to allow system resources to be freed. 
- Once you called end_access, you must call begin access before reusing this instance!""" - self._size = 0 - if self._c is not None: + Once you called close, you must call begin access before reusing this instance!""" + if self._c: self._c.unuse_region() - # END unuse region + self._c = None + self._size = 0 def cursor(self): """:return: the currently set cursor which provides access to the data""" diff --git a/smmap/mman.py b/smmap/mman.py index e4d0bbd..8f9821a 100644 --- a/smmap/mman.py +++ b/smmap/mman.py @@ -359,6 +359,11 @@ def __exit__(self, exc_type, exc_value, traceback): if leaft_overs: log.debug("Cleaned up %s left-over mmap-regions." % leaft_overs) + def __del__(self): + if self._entered != 0: + log.warning("Missed %s exit(s) on %s!" % (self._entered, self)) + self.close() + def close(self): self.collect() diff --git a/smmap/test/test_buf.py b/smmap/test/test_buf.py index fe2ce82..39043f6 100644 --- a/smmap/test/test_buf.py +++ b/smmap/test/test_buf.py @@ -32,25 +32,21 @@ def test_basics(self): self.assertRaises(ValueError, SlidingWindowMapBuffer, type(c)()) # invalid cursor self.assertRaises(ValueError, SlidingWindowMapBuffer, c, fc.size) # offset too large - with SlidingWindowMapBuffer() as buf: # can create uninitailized buffers - assert buf.cursor() is None - - # can call end access any time - buf.end_access() - buf.end_access() - self.assertEqual(len(buf), 0) - - # begin access can revive it, if the offset is suitable - offset = 100 - self.assertEqual(buf.begin_access(c, fc.size), False) - self.assertEqual(buf.begin_access(c, offset), True) + offset = 100 + with SlidingWindowMapBuffer(c, offset) as buf: + assert buf.cursor() + assert buf.cursor().is_valid() self.assertEqual(len(buf), fc.size - offset) + + with SlidingWindowMapBuffer(c, fc.size - offset) as buf: + assert buf.cursor() assert buf.cursor().is_valid() + self.assertEqual(len(buf), offset) - # empty begin access keeps it valid on the same path, but alters the offset - 
self.assertEqual(buf.begin_access(), True) - self.assertEqual(len(buf), fc.size) + with SlidingWindowMapBuffer(c) as buf: + assert buf.cursor() assert buf.cursor().is_valid() + self.assertEqual(len(buf), fc.size) # simple access with open(fc.path, 'rb') as fp: @@ -61,14 +57,10 @@ def test_basics(self): # negative indices, partial slices self.assertEqual(buf[-1], buf[len(buf) - 1]) self.assertEqual(buf[-10:], buf[len(buf) - 10:len(buf)]) - - # end access makes its cursor invalid - buf.end_access() - assert not buf.cursor().is_valid() - assert buf.cursor().is_associated() # but it remains associated - - # an empty begin access fixes it up again - self.assertEqual(buf.begin_access(), True and buf.cursor().is_valid()) + # end access makes its cursor invalid + assert not buf.cursor() + assert not c.is_valid() + assert c.is_associated() # but it remains associated self.assertEqual(man_optimal.num_file_handles(), 1) @@ -90,15 +82,14 @@ def test_performance(self): (man_worst_case, 'worst case'), (static_man, 'static optimal')): with manager: - with SlidingWindowMapBuffer(manager.make_cursor(item)) as buf: - self.assertEqual(manager.num_file_handles(), 1) - for access_mode in range(2): # single, multi + for access_mode in range(2): # single, multi + with SlidingWindowMapBuffer(manager.make_cursor(item)) as buf: + self.assertEqual(manager.num_file_handles(), 1) num_accesses_left = max_num_accesses num_bytes = 0 fsize = fc.size st = time() - buf.begin_access() while num_accesses_left: num_accesses_left -= 1 if access_mode: # multi @@ -115,18 +106,16 @@ def test_performance(self): num_bytes += 1 # END handle mode # END handle num accesses - - buf.end_access() - assert manager.num_file_handles() - assert manager.collect() - self.assertEqual(manager.num_file_handles(), 0) - elapsed = max(time() - st, 0.001) # prevent zero division errors on windows - mb = float(1000 * 1000) - mode_str = (access_mode and "slice") or "single byte" - print("%s: Made %i random %s accesses to buffer 
created from %s " - "reading a total of %f mb in %f s (%f mb/s)" - % (man_id, max_num_accesses, mode_str, type(item), - num_bytes / mb, elapsed, (num_bytes / mb) / elapsed), - file=sys.stderr) + assert manager.num_file_handles() + assert manager.collect() + self.assertEqual(manager.num_file_handles(), 0) + elapsed = max(time() - st, 0.001) # prevent zero division errors on windows + mb = float(1000 * 1000) + mode_str = (access_mode and "slice") or "single byte" + print("%s: Made %i random %s accesses to buffer created from %s " + "reading a total of %f mb in %f s (%f mb/s)" + % (man_id, max_num_accesses, mode_str, type(item), + num_bytes / mb, elapsed, (num_bytes / mb) / elapsed), + file=sys.stderr) finally: os.close(fd) diff --git a/smmap/test/test_tutorial.py b/smmap/test/test_tutorial.py index cfbc29f..0adec5d 100644 --- a/smmap/test/test_tutorial.py +++ b/smmap/test/test_tutorial.py @@ -74,8 +74,5 @@ def test_example(self): buf[-1] # access the last ten bytes on the file buf[-10:] # access the last ten bytes - # If you want to keep the instance between different accesses, use the - # dedicated methods - buf.end_access() - assert not buf.cursor().is_valid() # you cannot use the buffer anymore - assert buf.begin_access(offset=10) # start using the buffer at an offset + assert not buf.cursor() + assert not c.is_valid() # you cannot use the buffer anymore From d81dc1da26227b0e084438040ec114db48b25de5 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Wed, 26 Oct 2016 00:24:50 +0200 Subject: [PATCH 17/19] style(mman): move managed_mmaps() closer to 2 mmans --- smmap/mman.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/smmap/mman.py b/smmap/mman.py index 8f9821a..1478836 100644 --- a/smmap/mman.py +++ b/smmap/mman.py @@ -21,25 +21,6 @@ #}END utilities -def managed_mmaps(check_entered=True): - """Makes a memory-map context-manager instance for the correct python-version. 
- - :param bool check_entered: - whether to scream if not used as context-manager (`with` block) - :return: - either :class:`SlidingWindowMapManager` or :class:`StaticWindowMapManager` (if PY2) - - If you want to change other default parameters of these classes, use them directly. - - .. Tip:: - Use it in a ``with ...:`` block, to free cached (and unused) resources. - - """ - mman = SlidingWindowMapManager if PY3 else StaticWindowMapManager - - return mman(check_entered=check_entered) - - class WindowCursor(object): """ @@ -263,6 +244,25 @@ def fd(self): #} END interface +def managed_mmaps(check_entered=True): + """Makes a memory-map context-manager instance for the correct python-version. + + :param bool check_entered: + whether to scream if not used as context-manager (`with` block) + :return: + either :class:`SlidingWindowMapManager` or :class:`StaticWindowMapManager` (if PY2) + + If you want to change other default parameters of these classes, use them directly. + + .. Tip:: + Use it in a ``with ...:`` block, to free cached (and unused) resources. + + """ + mman = SlidingWindowMapManager if PY3 else StaticWindowMapManager + + return mman(check_entered=check_entered) + + class StaticWindowMapManager(object): """Provides a manager which will produce single size cursors that are allowed From 9ba1649d8e987c2e1ad26a54cbfa9cfe5f3884bb Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Thu, 27 Oct 2016 11:31:13 +0200 Subject: [PATCH 18/19] fix(leaks): FIX memoryview leak in Windows + All gitdb TCs now pass without explicit release! --- smmap/util.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/smmap/util.py b/smmap/util.py index 4cb6658..0628fc6 100644 --- a/smmap/util.py +++ b/smmap/util.py @@ -22,10 +22,9 @@ except NameError: # Python 3 has no `buffer`; only `memoryview` def buffer(obj, offset, size): - # Actually, for gitpython this is fastest ... .
- return memoryview(obj)[offset:offset + size] - # doing it directly is much faster ! - # return obj[offset:offset + size] + # Actually, for gitpython this is fastest ... but `memoryviews` LEAK! + #return memoryview(obj)[offset:offset + size] + return obj[offset:offset + size] PY3 = sys.version_info[0] >= 3 From 144891bc6c3d5ecdd96e522cb4e3571dff948171 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Thu, 27 Oct 2016 11:25:09 +0200 Subject: [PATCH 19/19] chore(ver): bump 2.1.0.dev3-->2.1.0.dev4 --- doc/source/changes.rst | 2 ++ smmap/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/changes.rst b/doc/source/changes.rst index e6b5d7a..e279b63 100644 --- a/doc/source/changes.rst +++ b/doc/source/changes.rst @@ -15,6 +15,8 @@ Changelog Get them from ``smmap.managed_mmaps()``. +- FIX ``memoryview`` leak in Windows; all *gitdb* TCs now pass without explicit release! + - Simplify :class:`SlidingWindowMapBuffer` as create/close context-manager (no ``begin_access()``, or ``end_access()``). diff --git a/smmap/__init__.py b/smmap/__init__.py index d1ba606..9f3e8eb 100644 --- a/smmap/__init__.py +++ b/smmap/__init__.py @@ -3,7 +3,7 @@ __author__ = "Sebastian Thiel" __contact__ = "byronimo@gmail.com" __homepage__ = "https://github.com/Byron/smmap" -version_info = (2, 1, 0, 'dev3') +version_info = (2, 1, 0, 'dev4') __version__ = '.'.join(str(i) for i in version_info) # make everything available in root package for convenience