From e56fb97cd673a52d02b68eebd28372f782750aa8 Mon Sep 17 00:00:00 2001 From: Thomas A Caswell Date: Thu, 3 Nov 2022 12:49:22 -0400 Subject: [PATCH 1/8] DOC: write out a prose narrative of the proposed design --- data_prototype/wrappers.py | 5 +- docs/source/conf.py | 7 +- docs/source/design.rst | 241 +++++++++++++++++++++++++++++++++++++ docs/source/index.rst | 16 +++ 4 files changed, 267 insertions(+), 2 deletions(-) create mode 100644 docs/source/design.rst diff --git a/data_prototype/wrappers.py b/data_prototype/wrappers.py index 059e507..1452a2f 100644 --- a/data_prototype/wrappers.py +++ b/data_prototype/wrappers.py @@ -126,6 +126,7 @@ def _query_and_transform(self, renderer, *, xunits: List[str], yunits: List[str] # actually query the underlying data. This returns both the (raw) data # and key to use for caching. bb_size = ax_bbox.size + # Step 1 data, cache_key = self.data.query( # TODO do this needs to be (de) unitized # TODO figure out why caching this did not work @@ -138,7 +139,8 @@ def _query_and_transform(self, renderer, *, xunits: List[str], yunits: List[str] return self._cache[cache_key] except KeyError: ... - # TODO decide if units go pre-nu or post-nu? 
+ + # Step 2 for x_like in xunits: if x_like in data: data[x_like] = ax.xaxis.convert_units(data[x_like]) @@ -146,6 +148,7 @@ def _query_and_transform(self, renderer, *, xunits: List[str], yunits: List[str] if y_like in data: data[y_like] = ax.xaxis.convert_units(data[y_like]) + # Step 3 # doing the nu work here is nice because we can write it once, but we # really want to push this computation down a layer # TODO sort out how this interoperates with the transform stack diff --git a/docs/source/conf.py b/docs/source/conf.py index 03e060d..091beab 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -55,6 +55,7 @@ plot_html_show_source_link = False plot_html_show_formats = False +hmathjax_path = "https://cdn.jsdelivr.net/npm/mathjax@2/MathJax.js?config=TeX-AMS-MML_HTMLorMML" # Generate the API documentation when building autosummary_generate = False @@ -157,7 +158,7 @@ def matplotlib_reduced_latex_scraper(block, block_vars, gallery_conf, **kwargs): # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = "mpl_sphinx_theme" +# html_theme = "mpl_sphinx_theme" # Theme options are theme-specific and customize the look and feel of a theme @@ -250,4 +251,8 @@ def matplotlib_reduced_latex_scraper(block, block_vars, gallery_conf, **kwargs): "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None), "pandas": ("https://pandas.pydata.org/pandas-docs/stable", None), "matplotlib": ("https://matplotlib.org/stable", None), + "networkx": ("https://networkx.org/documentation/stable", None), } + + +default_role = 'obj' diff --git a/docs/source/design.rst b/docs/source/design.rst new file mode 100644 index 0000000..20c3c82 --- /dev/null +++ b/docs/source/design.rst @@ -0,0 +1,241 @@ +======== + Design +======== + +When a Matplotlib :obj:`~matplotlib.artist.Artist` object in rendered via the `~matplotlib.artist.Artist.draw` method the following +steps happen (in spirit but maybe not exactly in code): + +1. 
get the data +2. convert from unit-full to unit-less data +3. convert the unit-less data from user-space to rendering-space +4. call the backend rendering functions + +.. + If we were to call these steps :math:`f_1` through :math:`f_4` this can be expressed as (taking + great liberties with the mathematical notation): + + .. math:: + + R = f_4(f_3(f_2(f_1()))) + + or if you prefer + + .. math:: + + R = (f_4 \circ f_3 \circ f_2 \circ f_1)() + + It is reasonable that if we can do this for one ``Artist``, we can build up + more complex visualizations by rendering multiple ``Artist`` to the same + target. + +However, this clear structure is frequently elided and obscured in the +Matplotlib code base: Step 3 is only present for *x* and *y* like data (encoded +in the `~matplotlib.transforms.TransformNode` objects) and color mapped data +(implemented in the `.matplotlib.colors.ScalarMappable` family of classes); the +application of Step 2 is inconsistent (both in actual application and when it +is applied) between artists; each ``Artist`` stores it's data in its own way +(typically as numpy arrays). + +With this view, we can understand the `~matplotlib.artist.Artist.draw` methods +to be very extensively `curried +`__ version of +these function chains where the objects allow us to modify the arguments to the +functions. + +The goal of this work is to bring this structure more the foreground in the internal of +Matplotlib to make it easier to reason about, easier to extend, and easier to inject +custom logic at each of the steps + +A paper with the formal mathematical description of these ideas is in +preparation. + +Data pipeline +============= + +Get the data (Step 1) +--------------------- + +Currently, almost all ``Artist`` class store the data associated with them as +attributes on the instances as `numpy.array` objectss. 
On one hand, this can +be very useful as historically data was frequently already in `numpy.array` +objects and, if you know the right methods for *this* ``Artist`` you can access +that state to update or query it. From a certain point of view, this is +consistent with the scheme laid out above as ``self.x[:]`` is really +``self.x.__getitem__(slice())`` which is (technically) a function call. + +However, this has several drawbacks. In most cases the data attributes on an +``Artist`` are closely linked -- the *x* and *y* on a +`~matplotlib.lines.Line2D` must be the same length -- and by storing them +separately it is possible that they will get out of sync in problematic ways. +Further, because the data is stored as materialized ``numpy`` arrays, there we +must decide before draw time what the correct sampling of the data is. While +there are some projects like `grave `__ that wrap +richer objects or `mpl-modest-image +`__, `datashader +`__, +and `mpl-scatter-density `__ +that dynamically re-sample the data these are niche libraries. + +The first goal of this project is to bring support for draw-time resampleing to +every Matplotlib ``Artist`` out of the box. The current approach is to move +all of the data storage off of the ``Artist`` directly and into a (so-called) +`~data_prototype.containers.DataContainer` instance. The primary method on these objects +is the `~data_prototype.containers.DataContainer.query` method which has the signature :: + + def query( + self, + transform: _Transform, + size: Tuple[int, int], + ) -> Tuple[Dict[str, Any], Union[str, int]]: + +The query is passed in: + +- A transform from "Axes" to "data" (using Matplotlib's names for the `various + coordinate systems + `__ +- A notion of how big the axes is in "pixels" to provide guidance on what the correct number + of samples to return is. 
+ +It will return: + +- A mapping of strings to things that is coercible (with the help of the + functions is steps 2 and 3) to a numpy array or types understandable by the + backends. +- A key that can be used for caching + +This function will be called at draw time by the ``Aritist`` to get the data to +be drawn. In the simplest cases +(e.g. `~data_prototype.containers.ArrayContainer` and +`~data_prototype.containers.DataFrameContainer`) the ``query`` method ignores +the input and returns the data as-is. However, based on these inputs it is +possible for the ``query`` method to get the data limits, even sampling in +screen space, and an approximate estimate of the resolution of the +visualization. This also opens up several interesting possibilities: + +1. "Pure function" containers (such as + `~data_prototype.containers.FuncContainer`) which will dynamically sample a + function at "a good resolution" for the current data limits and screen size. +2. A "resampling" container that either down-samples or slices the data it holds based on + the current view limits. +3. A container that makes a network or database call and automatically refreshes the data + as a function of time. +4. Containers that do binning or aggregation of the user data (such as + `~data_prototype.containers.HistContainer`). + +By accessing all of the data that is needed in draw in a single function call +the ``DataContainer`` instances can ensure that the data is coherent and +consistent. This is important for applications like steaming where different +parts of the data may be arriving at different rates and it would thus be the +``DataContainer``'s responsibility to settle any race conditions and always +return aligned data to the ``Artist``. + + +There is still some ambiguity as to what should be put in the data. 
For +example with `~matplotlib.lines.Line2D` it is clear that the *x* and *y* data +should be pulled from the ``DataConatiner``, but things like *color* and +*linewidth* are ambiguous. A later section will make the case that it should be +possible, but maybe not required, that these values be accessible in the data +context. + +An additional task that the ``DataContainer`` can do is to describe the type, +shape, fields, and topology of the data it contains. For example a +`~matplotlib.lines.Line2D` needs an *x* and *y* that are the same length, but +`~matplotlib.patches.StepPatch` (which is also a 2D line) needs a *x* that is +one longer than the *y*. The difference is that a ``Line2D`` in points with +values which can be continuously interpolated between and ``StepPatch`` is bin +edges with a constant value between the edges. This design lets us make +explicit the implicit encoding of this sort of distinction in Matplotlib and be +able to programatically operate on it. The details of exactly how to encode +all of this still needs to be developed. There is a +`~data_prototype.containers.DataContainer.describe` method, however it is the +most provisional part of the current design. + + +Unit conversion (Step 2) +------------------------ + +Real data almost always has some units attached to it. Historically, this +information can be carried "out of band" in the structure of the code or in +custom containers or data types that are unit-aware. The recent work on ``numpy`` to +make ``np.dtype`` more easily extendable is likely to make unit-full data much more +common and easier to work with in the future. + +In principle the user should be able to plot sets of data, one of them in *ft* +the other in *m* and then show the ticks in *in* and then switch to *cm* and +have everything "just work" for all plot types. 
Currently we are very far from +this due to some parts of the code eagerly converting to the unit-less +representation and not keeping the original, some parts of the code failing to +do the conversion at all, some parts doing the conversion after coercing to +``numpy`` and losing the unit information, etc. Further, because the data +access and processing pipeline is done differently in every ``Artist`` it is a +constant game of whack-a-bug to keep this working. If we adopt the consistent +``DataContainer`` model for accessing the data and call +`~data_prototype.containers.DataContainer.query` at draw time we will have a +consistent place to also do the unit conversion. + +The ``DataContainer`` can also carry inspectable information about what the +units of its data are in which would make it possible to do ahead-of-time +verification that the data of all of the ``Artists`` in an ``Axes`` are +consistent with unit converters on the ``Axis``. + + +Convert for rendering (Step 3) +------------------------------ + +The next step is to get the data from unit-less "user data" into something that +the backend renderer understand. This can range from coordinate +transformations (as with the ``Transfrom`` stack operations on *x* and *y* like +values), representation conversions (like named colors to RGB values), mapping +stings to a set of objects (like named markershape), to paraaterized type +conversion (like colormapping). Although Matplotlib is currently doing all of +these conversions, the user really only has control of the position and +colormapping (on `~matplotlib.colors.ScalarMappable` sub-classes). The next +thing that this design allows is for user defined functions to be passed for +any of the relevant data fields. 
+ +This will open up paths to do a number of nice things such as multi-variate +color maps, lines who's width and color vary along their length, constant but +parameterized colors and linestyles, and a version of ``scatter`` where the +marker shape depends on the data. All of these things are currently possible +in Matplotlib, but require significant work before calling Matplotlib and can +be very difficult to update after the fact. + +Pass to backend (Step 4) +------------------------ + +This part of the process is proposed to remain unchanged from current +Matplotlib. The calls to the underlying ``Renderer`` objects in ``draw`` +methods have stood the test of time and changing them is out of scope for the +current work. In the future we may want to consider eliding Steps 3 and 4 in +some cases for performance reasons to be able push the computation down to a +GPU. + + +Caching +======= + +A key to keeping this implementation efficient is to be able to cache when we +have to re-compute values. Internally current Matplotlib has a number of +ad-hoc caches, such as in ``ScalarMappable`` and ``Line2D``. Going down the +route of hashing all of the data is not a sustainable path (in the case even +modestly sized data the time to hash the data will quickly out-strip any +possible time savings doing the cache lookup!). The proposed ``query`` method +returns a cache key that it generates to the caller. The exact details of how +to generate that key are left to the ``DataContainer`` implementation, but if +the returned data changed, then the cache key must change. The cache key +should be computed from a combination of the ``DataContainers`` internal state, +the transform and size passed in. + +The choice to return the data and cache key in one step, rather than be a two +step process is drive by simplicity and because the cache key is computed +inside of the ``query`` call. 
If computing the cache key is fast and the data
to be returned is "reasonable" for the machine Matplotlib is running on
From 3a0faf4ffc4fb0f76389b19eb55278e45013b4f7 Mon Sep 17 00:00:00 2001 From: Thomas A Caswell Date: Thu, 3 Nov 2022 13:38:37 -0400 Subject: [PATCH 3/8] STY: fix formatting in sphinx configuration --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 091beab..3b57575 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -255,4 +255,4 @@ def matplotlib_reduced_latex_scraper(block, block_vars, gallery_conf, **kwargs): } -default_role = 'obj' +default_role = "obj" From 6749728c264a04a4181f3cfce2d1dbef35a3cdbe Mon Sep 17 00:00:00 2001 From: Thomas A Caswell Date: Tue, 15 Nov 2022 17:57:44 -0500 Subject: [PATCH 4/8] FIX: use correct axis for units --- data_prototype/wrappers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_prototype/wrappers.py b/data_prototype/wrappers.py index 1452a2f..390527b 100644 --- a/data_prototype/wrappers.py +++ b/data_prototype/wrappers.py @@ -146,7 +146,7 @@ def _query_and_transform(self, renderer, *, xunits: List[str], yunits: List[str] data[x_like] = ax.xaxis.convert_units(data[x_like]) for y_like in yunits: if y_like in data: - data[y_like] = ax.xaxis.convert_units(data[y_like]) + data[y_like] = ax.yaxis.convert_units(data[y_like]) # Step 3 # doing the nu work here is nice because we can write it once, but we From 771e6829a604def81467903b2df27331d3541593 Mon Sep 17 00:00:00 2001 From: Thomas A Caswell Date: Mon, 14 Nov 2022 23:15:57 -0500 Subject: [PATCH 5/8] DOC: an editing pass at the prose --- docs/source/design.rst | 114 ++++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 47 deletions(-) diff --git a/docs/source/design.rst b/docs/source/design.rst index def4f06..80281a7 100644 --- a/docs/source/design.rst +++ b/docs/source/design.rst @@ -2,8 +2,10 @@ Design ======== -When a Matplotlib :obj:`~matplotlib.artist.Artist` object in rendered via the `~matplotlib.artist.Artist.draw` method the following 
-steps happen (in spirit but maybe not exactly in code): + +When a Matplotlib :obj:`~matplotlib.artist.Artist` object in rendered via the +`~matplotlib.artist.Artist.draw` method the following steps happen (in spirit +but maybe not exactly in code): 1. get the data 2. convert from unit-full to unit-less data @@ -29,22 +31,23 @@ steps happen (in spirit but maybe not exactly in code): target. However, this clear structure is frequently elided and obscured in the -Matplotlib code base: Step 3 is only present for *x* and *y* like data (encoded -in the `~matplotlib.transforms.TransformNode` objects) and color mapped data -(implemented in the `.matplotlib.colors.ScalarMappable` family of classes); the -application of Step 2 is inconsistent (both in actual application and when it -is applied) between artists; each ``Artist`` stores it's data in its own way -(typically as numpy arrays). +Matplotlib code base: Step 3 is only present for *x* and *y* like data +(encapsulated in the `~matplotlib.transforms.TransformNode` objects) and color +mapped data (encapsulated in the `.matplotlib.colors.ScalarMappable` family of +classes); the application of Step 2 is inconsistent (both in actual application +and when it is applied) between artists; each ``Artist`` stores its data in +its own way (typically as numpy arrays). With this view, we can understand the `~matplotlib.artist.Artist.draw` methods -to be very extensively `curried -`__ version of -these function chains where the objects allow us to modify the arguments to the -functions. +to be very extensively `curried `__ +version of these function chains where the objects allow us to modify the +arguments to the functions and the re-run them. 
-The goal of this work is to bring this structure more the foreground in the internal of -Matplotlib to make it easier to reason about, easier to extend, and easier to inject -custom logic at each of the steps +The goal of this work is to bring this structure more to the foreground in the +internal structure of Matplotlib. By exposing this inherent structure +uniformity in the architecture of Matplotlib the library will be easier to +reason about and easier to extend by injecting custom logic at each of +the steps A paper with the formal mathematical description of these ideas is in preparation. @@ -55,55 +58,66 @@ Data pipeline Get the data (Step 1) --------------------- -Currently, almost all ``Artist`` class store the data associated with them as -attributes on the instances as `numpy.array` objectss. On one hand, this can -be very useful as historically data was frequently already in `numpy.array` -objects and, if you know the right methods for *this* ``Artist`` you can access -that state to update or query it. From a certain point of view, this is -consistent with the scheme laid out above as ``self.x[:]`` is really -``self.x.__getitem__(slice())`` which is (technically) a function call. - -However, this has several drawbacks. In most cases the data attributes on an -``Artist`` are closely linked -- the *x* and *y* on a +In this context "data" is post any data-to-data transformations or +aggregations. There is already extensive tooling and literature around that +aspect. By completely decoupling the aggregations pipeline from the +visualization process we are able to both simplify and generalize the problem. + +Currently, almost all ``Artist`` classes store the data they are representing +as attributes on the instances as realized `numpy.array` [#]_ objects. On one +hand, this can be very useful as historically data was frequently already in +`numpy.array` objects in the users' namespace. 
If you know the right methods +for *this* ``Artist``, you can query or update the data without recreating the +Artist. This is technically consistent with the scheme laid out above if we +understand ``self.x[:]`` as ``self.x.__getitem__(slice())`` which is a function +call. + +However, this method of storing the data has several drawbacks. In most cases +the data attributes on an ``Artist`` are closely linked -- the *x* and *y* on a `~matplotlib.lines.Line2D` must be the same length -- and by storing them -separately it is possible that they will get out of sync in problematic ways. -Further, because the data is stored as materialized ``numpy`` arrays, there we -must decide before draw time what the correct sampling of the data is. While -there are some projects like `grave `__ that wrap -richer objects or `mpl-modest-image +separately it is possible for them to become inconsistent in ways that noticed +until draw time [#]_. Further, because the data is stored as materialized +``numpy`` arrays, we must decide before draw time what the correct sampling of +the data is. While there are some projects like `grave `__ that wrap richer objects or `mpl-modest-image `__, `datashader `__, and `mpl-scatter-density `__ -that dynamically re-sample the data these are niche libraries. +that dynamically re-sample the data, these libraries have had only limited +adoption. -The first goal of this project is to bring support for draw-time resampleing to -every Matplotlib ``Artist`` out of the box. The current approach is to move -all of the data storage off of the ``Artist`` directly and into a (so-called) -`~data_prototype.containers.DataContainer` instance. The primary method on these objects -is the `~data_prototype.containers.DataContainer.query` method which has the signature :: +The first goal of this project is to bring support for draw-time resampling to +every Matplotlib ``Artist``. 
The proposed approach is to move the data storage +of the ``Artist`` to be indirectly via a (so-called) +`~data_prototype.containers.DataContainer` instance rather than directly. The +primary method on these objects is the +`~data_prototype.containers.DataContainer.query` method which has the signature +:: def query( self, - transform: _Transform, + /, + coord_transform: _MatplotlibTransform, size: Tuple[int, int], ) -> Tuple[Dict[str, Any], Union[str, int]]: The query is passed in: -- A transform from "Axes" to "data" (using Matplotlib's names for the `various - coordinate systems - `__ -- A notion of how big the axes is in "pixels" to provide guidance on what the correct number - of samples to return is. +- A *coord_transform* from "Axes fraction" to "data" (using Matplotlib's names + for the `coordinate systems + `__) +- A notion of how big the axes is in "pixels" to provide guidance on what the + correct number of samples to return is. For raster outputs this is literal + pixels but for vector backends it will have to be an effective resolution. It will return: -- A mapping of strings to things that is coercible (with the help of the +- A mapping of strings to things that are coercible (with the help of the functions is steps 2 and 3) to a numpy array or types understandable by the backends. - A key that can be used for caching -This function will be called at draw time by the ``Aritist`` to get the data to +This function will be called at draw time by the ``Artist`` to get the data to be drawn. In the simplest cases (e.g. `~data_prototype.containers.ArrayContainer` and `~data_prototype.containers.DataFrameContainer`) the ``query`` method ignores @@ -124,7 +138,7 @@ visualization. This also opens up several interesting possibilities: By accessing all of the data that is needed in draw in a single function call the ``DataContainer`` instances can ensure that the data is coherent and -consistent. 
This is important for applications like steaming where different +consistent. This is important for applications like streaming where different parts of the data may be arriving at different rates and it would thus be the ``DataContainer``'s responsibility to settle any race conditions and always return aligned data to the ``Artist``. @@ -132,7 +146,7 @@ return aligned data to the ``Artist``. There is still some ambiguity as to what should be put in the data. For example with `~matplotlib.lines.Line2D` it is clear that the *x* and *y* data -should be pulled from the ``DataConatiner``, but things like *color* and +should be pulled from the ``DataContiner``, but things like *color* and *linewidth* are ambiguous. A later section will make the case that it should be possible, but maybe not required, that these values be accessible in the data context. @@ -224,7 +238,7 @@ returns a cache key that it generates to the caller. The exact details of how to generate that key are left to the ``DataContainer`` implementation, but if the returned data changed, then the cache key must change. The cache key should be computed from a combination of the ``DataContainers`` internal state, -the transform and size passed in. +the coordinate transformation and size passed in. The choice to return the data and cache key in one step, rather than be a two step process is drive by simplicity and because the cache key is computed @@ -239,3 +253,9 @@ management at the ``Artist`` layer. We also need to determine how many cache layers to keep. Currently only the results of Step 3 are cached, but we may want to additionally cache intermediate results after Step 2. The caching from Step 1 is likely best left to the ``DataContainer`` instances. + +.. [#] Not strictly true, in some cases we also store the values in the data in + the container it came in with which may not be a `numpy.array`. +.. 
[#] For example `matplotlib.lines.Line2D.set_xdata` and + `matplotlib.lines.Line2D.set_ydata` do not check the lengths of the + input at call time. From 38b394c7ba1e2b5105fa681ade8df5a553004a18 Mon Sep 17 00:00:00 2001 From: Thomas A Caswell Date: Tue, 15 Nov 2022 18:25:23 -0500 Subject: [PATCH 6/8] DOC: more editing --- docs/source/design.rst | 62 +++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/docs/source/design.rst b/docs/source/design.rst index 80281a7..960d59d 100644 --- a/docs/source/design.rst +++ b/docs/source/design.rst @@ -44,10 +44,9 @@ version of these function chains where the objects allow us to modify the arguments to the functions and the re-run them. The goal of this work is to bring this structure more to the foreground in the -internal structure of Matplotlib. By exposing this inherent structure -uniformity in the architecture of Matplotlib the library will be easier to -reason about and easier to extend by injecting custom logic at each of -the steps +internal structure of Matplotlib. By exposing this inherent structure in the +architecture of Matplotlib the library will be easier to reason about and +easier to extend by injecting custom logic at each of the steps. A paper with the formal mathematical description of these ideas is in preparation. @@ -58,41 +57,48 @@ Data pipeline Get the data (Step 1) --------------------- -In this context "data" is post any data-to-data transformations or -aggregations. There is already extensive tooling and literature around that -aspect. By completely decoupling the aggregations pipeline from the -visualization process we are able to both simplify and generalize the problem. +In this context "data" is post any data-to-data transformations or aggregation +steps. There is already extensive tooling and literature around that aspect +which we do not need to recreate. 
By completely decoupling the aggregations +pipeline from the visualization process we are able to both simplify and +generalize the software. Currently, almost all ``Artist`` classes store the data they are representing as attributes on the instances as realized `numpy.array` [#]_ objects. On one -hand, this can be very useful as historically data was frequently already in +hand, this can be very convenient as data is frequently already in `numpy.array` objects in the users' namespace. If you know the right methods for *this* ``Artist``, you can query or update the data without recreating the -Artist. This is technically consistent with the scheme laid out above if we -understand ``self.x[:]`` as ``self.x.__getitem__(slice())`` which is a function -call. - -However, this method of storing the data has several drawbacks. In most cases -the data attributes on an ``Artist`` are closely linked -- the *x* and *y* on a -`~matplotlib.lines.Line2D` must be the same length -- and by storing them -separately it is possible for them to become inconsistent in ways that noticed -until draw time [#]_. Further, because the data is stored as materialized +``Artist``. This is technically consistent with the scheme laid out above if +we understand ``self.x[:]`` as ``self.x.__getitem__(slice())`` which is the +function call in step 1. + +However, this method of storing the data has several drawbacks. + +In most cases the data attributes on an ``Artist`` are closely linked -- the +*x* and *y* on a `~matplotlib.lines.Line2D` must be the same length -- and by +storing them separately it is possible for them to become inconsistent in ways +that noticed until draw time [#]_. With the rise of more structured data, such +as ``pandas.DataFrame`` and ``xarray.Dataset`` users are more frequently having +their data is coherent objects rather than individual arrays. Currently +Matplotlib requires that these structures be decomposed and losing the +association between the individual arrays. 
+ +An goal of this project is to bring support for draw-time resampling to every +Matplotlib ``Artist``. Further, because the data is stored as materialized ``numpy`` arrays, we must decide before draw time what the correct sampling of -the data is. While there are some projects like `grave `__ that wrap richer objects or `mpl-modest-image +the data is. Projects like `grave `__ that wrap +richer objects or `mpl-modest-image `__, `datashader `__, and `mpl-scatter-density `__ -that dynamically re-sample the data, these libraries have had only limited +that dynamically re-sample the data do exist, but they have only seen limited adoption. -The first goal of this project is to bring support for draw-time resampling to -every Matplotlib ``Artist``. The proposed approach is to move the data storage -of the ``Artist`` to be indirectly via a (so-called) -`~data_prototype.containers.DataContainer` instance rather than directly. The -primary method on these objects is the -`~data_prototype.containers.DataContainer.query` method which has the signature -:: +This is a proposal to add a level of indirection the data storage -- via a +(so-called) `~data_prototype.containers.DataContainer` -- rather than directly +as individual numpy arrays on the ``Artist`` instances. 
The primary method on +these objects is the `~data_prototype.containers.DataContainer.query` method +which has the signature :: def query( self, From 675ff3c0d4679347a6c7834470da4f8afe35eb7a Mon Sep 17 00:00:00 2001 From: Thomas A Caswell Date: Wed, 23 Nov 2022 14:58:25 -0500 Subject: [PATCH 7/8] WIP: more editing work --- docs/source/design/containers.rst | 42 ++++++ docs/source/{design.rst => design/index.rst} | 146 +++++++++++-------- docs/source/index.rst | 2 +- 3 files changed, 126 insertions(+), 64 deletions(-) create mode 100644 docs/source/design/containers.rst rename docs/source/{design.rst => design/index.rst} (68%) diff --git a/docs/source/design/containers.rst b/docs/source/design/containers.rst new file mode 100644 index 0000000..5844d39 --- /dev/null +++ b/docs/source/design/containers.rst @@ -0,0 +1,42 @@ +Containers Design Choices +========================= + +``Callable`` vs ``class`` +------------------------- + +In the mathematical formulation we model the data source as a function, however +in the implementation this has been promoted to a full class with two methods +of ``query`` and ``describe``. + +The justification for this is: + +1. We anticipate the need to update the data held by the container so will + likely be backed by instances of classes in practice +2. We will want the containers to provide a static type and shape information + about itself. In principle this could be carried in the signature of the + function, but Python's built in type system is too dynamic for this to be + practical. + +A `types.SimpleNamespace` with the correct names is compatible with this API. + + +``obj.__call__`` and ``obj.describe`` would make the data feel more like a +function, however if someone wanted to implement this with a function rather +than a class it would require putting a callable as an attribute on a callable. +This is technically allowed in Python, but a bit weird. + +Caching design +-------------- + +.. 
note:: + + There are two hard problems in computer science: + + 1. naming things + 2. cache invalidation + 3. off-by-one bugs + +Because we are adding a layer of indirection to the data access it is no longer +generally true that "getting the data" is cheap nor that any layer of the +system has all of the information required to know if cached values are still +valid. diff --git a/docs/source/design.rst b/docs/source/design/index.rst similarity index 68% rename from docs/source/design.rst rename to docs/source/design/index.rst index 960d59d..1185f6f 100644 --- a/docs/source/design.rst +++ b/docs/source/design/index.rst @@ -3,6 +3,10 @@ ======== +Introduction +============ + + When a Matplotlib :obj:`~matplotlib.artist.Artist` object in rendered via the `~matplotlib.artist.Artist.draw` method the following steps happen (in spirit but maybe not exactly in code): @@ -12,41 +16,49 @@ but maybe not exactly in code): 3. convert the unit-less data from user-space to rendering-space 4. call the backend rendering functions -.. - If we were to call these steps :math:`f_1` through :math:`f_4` this can be expressed as (taking - great liberties with the mathematical notation): +If we were to call these steps :math:`f_1` through :math:`f_4` this can be expressed as (taking +great liberties with the mathematical notation): + +.. math:: - .. math:: + R = f_4(f_3(f_2(f_1()))) - R = f_4(f_3(f_2(f_1()))) +or if you prefer - or if you prefer +.. math:: - .. math:: + R = (f_4 \circ f_3 \circ f_2 \circ f_1)() - R = (f_4 \circ f_3 \circ f_2 \circ f_1)() +If we can do this for one ``Artist``, we can build up more complex +visualizations via composition by rendering multiple ``Artist`` to the +same target. - It is reasonable that if we can do this for one ``Artist``, we can build up - more complex visualizations by rendering multiple ``Artist`` to the same - target. 
+We can understand the :obj:`~matplotlib.artist.Artist.draw` methods to be +extensively `curried `__ versions of +these function chains. By wrapping the functions in objects we can modify the +bound arguments to the functions. However, the clear structure is frequently +elided or obscured in the Matplotlib code base and there is an artificial +distinction between "data" and "style" inputs. -However, this clear structure is frequently elided and obscured in the -Matplotlib code base: Step 3 is only present for *x* and *y* like data -(encapsulated in the `~matplotlib.transforms.TransformNode` objects) and color -mapped data (encapsulated in the `.matplotlib.colors.ScalarMappable` family of -classes); the application of Step 2 is inconsistent (both in actual application -and when it is applied) between artists; each ``Artist`` stores its data in -its own way (typically as numpy arrays). +For example mapping from "user data" to "rendering data" (Step 3) is only done +at draw-time for *x* / *y* like data (encapsulated in the +`~matplotlib.transforms.TransformNode` objects) and color mapped data +(encapsulated in the `~matplotlib.cm.ScalarMappable` family of classes). +If users need to do any other mapping between their data and Matplotlib's +rendering space, it must be done in user code and the results passed into +Matplotlib. The application of unit conversion (Step 2) is inconsistent (both +in actual application and when it is applied) between artists. This is a +particular difficulty for ``Artists`` parameterized by deltas (e.g. *height* +and *width* for a Rectangle) where the order of unit conversion and computing +the absolute bounding box can be fraught. Finally, each ``Artist`` stores its +data in its own way (typically as materialized numpy arrays) which makes it +difficult to update artists in a uniform way.
-With this view, we can understand the `~matplotlib.artist.Artist.draw` methods -to be very extensively `curried `__ -version of these function chains where the objects allow us to modify the -arguments to the functions and the re-run them. The goal of this work is to bring this structure more to the foreground in the internal structure of Matplotlib. By exposing this inherent structure in the architecture of Matplotlib the library will be easier to reason about and -easier to extend by injecting custom logic at each of the steps. +easier to extend. A paper with the formal mathematical description of these ideas is in preparation. @@ -57,11 +69,12 @@ Data pipeline Get the data (Step 1) --------------------- -In this context "data" is post any data-to-data transformations or aggregation -steps. There is already extensive tooling and literature around that aspect -which we do not need to recreate. By completely decoupling the aggregations -pipeline from the visualization process we are able to both simplify and -generalize the software. +.. note :: + + In this context "data" is post any data-to-data transformation or aggregation + steps. Because this proposal holds a function, rather than materialized + arrays, we can defer actually executing the data pipeline until draw time, + but Matplotlib does not need any visibility into what this pipeline is. Currently, almost all ``Artist`` classes store the data they are representing as attributes on the instances as realized `numpy.array` [#]_ objects. On one @@ -72,33 +85,29 @@ for *this* ``Artist``, you can query or update the data without recreating the we understand ``self.x[:]`` as ``self.x.__getitem__(slice())`` which is the function call in step 1. -However, this method of storing the data has several drawbacks.
- -In most cases the data attributes on an ``Artist`` are closely linked -- the -*x* and *y* on a `~matplotlib.lines.Line2D` must be the same length -- and by -storing them separately it is possible for them to become inconsistent in ways -that noticed until draw time [#]_. With the rise of more structured data, such -as ``pandas.DataFrame`` and ``xarray.Dataset`` users are more frequently having -their data is coherent objects rather than individual arrays. Currently +However, this method of storing the data has several drawbacks. In most cases +the data attributes on an ``Artist`` are closely linked -- the *x* and *y* on a +`~matplotlib.lines.Line2D` must be the same length -- and by storing them +separately it is possible for them to become inconsistent in ways that are not noticed +until draw time [#]_. With the rise of more structured data types, such as +`pandas.DataFrame` and `xarray.core.dataset.Dataset`, users are likely to have +their data in coherent objects rather than as individual arrays. Currently Matplotlib requires that these structures be decomposed and losing the -association between the individual arrays. - -An goal of this project is to bring support for draw-time resampling to every -Matplotlib ``Artist``. Further, because the data is stored as materialized -``numpy`` arrays, we must decide before draw time what the correct sampling of -the data is. Projects like `grave `__ that wrap -richer objects or `mpl-modest-image +association between the individual arrays. Further, because the data is stored +as materialized ``numpy`` arrays, we must decide before draw time what the +correct sampling of the data is. Projects like `grave `__ that wrap richer objects or `mpl-modest-image `__, `datashader `__, and `mpl-scatter-density `__ that dynamically re-sample the data do exist, but they have only seen limited adoption.
-This is a proposal to add a level of indirection the data storage -- via a -(so-called) `~data_prototype.containers.DataContainer` -- rather than directly -as individual numpy arrays on the ``Artist`` instances. The primary method on -these objects is the `~data_prototype.containers.DataContainer.query` method -which has the signature :: +The first structural change of this proposal is to add a layer of indirection +-- via a (so-called) `~data_prototype.containers.DataContainer` -- to the data +storage and access. The primary method on these objects is the +`~data_prototype.containers.DataContainer.query` method with the signature +:: def query( self, @@ -107,7 +116,7 @@ which has the signature :: size: Tuple[int, int], ) -> Tuple[Dict[str, Any], Union[str, int]]: -The query is passed in: +The query is passed: - A *coord_transform* from "Axes fraction" to "data" (using Matplotlib's names for the `coordinate systems @@ -119,9 +128,9 @@ The query is passed in: It will return: - A mapping of strings to things that are coercible (with the help of the - functions is steps 2 and 3) to a numpy array or types understandable by the + functions in Steps 2 and 3) to a numpy array or types understandable by the backends. -- A key that can be used for caching +- A key that can be used for caching by the caller This function will be called at draw time by the ``Artist`` to get the data to be drawn. In the simplest cases @@ -153,9 +162,8 @@ return aligned data to the ``Artist``. There is still some ambiguity as to what should be put in the data. For example with `~matplotlib.lines.Line2D` it is clear that the *x* and *y* data should be pulled from the ``DataContiner``, but things like *color* and -*linewidth* are ambiguous. A later section will make the case that it should be -possible, but maybe not required, that these values be accessible in the data -context. +*linewidth* are ambiguous. 
It should be possible, but maybe not required, that +these values be derived from the data returned by the ``DataContainer``. An additional task that the ``DataContainer`` can do is to describe the type, shape, fields, and topology of the data it contains. For example a @@ -170,6 +178,7 @@ all of this still needs to be developed. There is a `~data_prototype.containers.DataContainer.describe` method, however it is the most provisional part of the current design. +This does not address how the ``DataContainer`` objects are generated in practice. Unit conversion (Step 2) ------------------------ @@ -209,7 +218,7 @@ values), representation conversions (like named colors to RGB values), mapping stings to a set of objects (like named markershape), to paraaterized type conversion (like colormapping). Although Matplotlib is currently doing all of these conversions, the user really only has control of the position and -colormapping (on `~matplotlib.colors.ScalarMappable` sub-classes). The next +colormapping (on `~matplotlib.cm.ScalarMappable` sub-classes). The next thing that this design allows is for user defined functions to be passed for any of the relevant data fields. @@ -237,14 +246,14 @@ Caching A key to keeping this implementation efficient is to be able to cache when we have to re-compute values. Internally current Matplotlib has a number of ad-hoc caches, such as in ``ScalarMappable`` and ``Line2D``. Going down the -route of hashing all of the data is not a sustainable path (in the case even -modestly sized data the time to hash the data will quickly out-strip any -possible time savings doing the cache lookup!). The proposed ``query`` method -returns a cache key that it generates to the caller. The exact details of how -to generate that key are left to the ``DataContainer`` implementation, but if -the returned data changed, then the cache key must change. 
The cache key -should be computed from a combination of the ``DataContainers`` internal state, -the coordinate transformation and size passed in. +route of hashing all of the data is not a sustainable path (even with modestly +sized data the time to hash the data will quickly out-strip any possible time +savings doing the cache lookup!). The proposed ``query`` method returns a +cache key that it generates to the caller. The exact details of how to +generate that key are left to the ``DataContainer`` implementation, but if the +returned data changed, then the cache key must change. The cache key should be +computed from a combination of the ``DataContainer``'s internal state and the arguments +passed to ``query``. The choice to return the data and cache key in one step, rather than be a two step process is drive by simplicity and because the cache key is computed @@ -260,6 +269,17 @@ layers to keep. Currently only the results of Step 3 are cached, but we may want to additionally cache intermediate results after Step 2. The caching from Step 1 is likely best left to the ``DataContainer`` instances. +Detailed design notes +===================== + + +.. toctree:: + :maxdepth: 2 + + containers + + + .. [#] Not strictly true, in some cases we also store the values in the data in the container it came in with which may not be a `numpy.array`. .. [#] For example `matplotlib.lines.Line2D.set_xdata` and diff --git a/docs/source/index.rst b/docs/source/index.rst index 60eb1e2..e1b68bd 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -17,7 +17,7 @@ Design .. toctree:: :maxdepth: 2 - design.rst + design/index Examples From d62a29a8ba0037dbf33d54bececc50f4fb76ae7b Mon Sep 17 00:00:00 2001 From: Thomas A Caswell Date: Wed, 23 Nov 2022 14:58:43 -0500 Subject: [PATCH 8/8] CI: give up on single page html It is now too long.
--- .github/workflows/docs.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 398e6bf..2d09e9b 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -19,11 +19,11 @@ jobs: - name: Install mpl-gui run: python -m pip install -v . - name: Build - run: make -Cdocs singlehtml + run: make -Cdocs html - name: Publish if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} uses: peaceiris/actions-gh-pages@v3 with: github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: ./docs/build/singlehtml + publish_dir: ./docs/build/html force_orphan: true