Skip to content

Commit d1022c6

Browse files
authored
Merge pull request #50 from posit-dev/feat-get-api-text
feat: add internal functions to get condensed API docs and examples text
2 parents 78b5f94 + a6f2926 commit d1022c6

File tree

2 files changed

+294
-0
lines changed

2 files changed

+294
-0
lines changed

pointblank/_utils.py

+276
Original file line numberDiff line numberDiff line change
@@ -383,3 +383,279 @@ def _check_invalid_fields(fields: list[str], valid_fields: list[str]):
383383
for field in fields:
384384
if field not in valid_fields:
385385
raise ValueError(f"Invalid field: {field}")
386+
387+
388+
def get_api_details(module, exported_list):
    """
    Retrieve the signatures and docstrings of the functions/classes in the exported list.

    Parameters
    ----------
    module : module
        The module (or any object supporting attribute lookup) from which to retrieve the
        functions/classes.
    exported_list : list
        A list of function/class names as strings. Dotted names (e.g., `"Validate.interrogate"`)
        are resolved one attribute at a time, starting from `module`.

    Returns
    -------
    str
        A string containing, for every entry and in the order given, the object name immediately
        followed by its signature, then its docstring on the following lines, then a blank line.
        An empty `exported_list` yields an empty string.

    Raises
    ------
    AttributeError
        If any part of a (dotted) name cannot be found, or the resolved object has no `__name__`.
    TypeError, ValueError
        Propagated from `inspect.signature()` when no signature can be obtained for an object.
    """
    entries = []

    for fn in exported_list:
        # Walk the attribute path to handle nested attributes
        obj = module
        for part in fn.split("."):
            obj = getattr(obj, part)

        obj_name = obj.__name__
        sig = inspect.signature(obj)

        # `__doc__` is `None` for undocumented objects; fall back to an empty string so that
        # the literal text "None" does not end up in the generated documentation
        doc = obj.__doc__ or ""

        # Combine the object name, signature, and docstring
        entries.append(f"{obj_name}{sig}\n{doc}\n\n")

    # Join once at the end instead of growing a string inside the loop
    return "".join(entries)
427+
428+
429+
def _get_api_text() -> str:
    """
    Get the API documentation for the Pointblank library.

    The text starts with a banner, followed by one section per family of exported
    functions/methods. Each section consists of a `## The <name> family` heading, a short
    description, and the name, signature, and docstring of every member (as produced by
    `get_api_details()`). Afterwards, Quarto-style `{python}` fence labels are rewritten to
    `python` and code cells carrying the `#| echo: false` directive are removed.

    Returns
    -------
    str
        The API documentation for the Pointblank library.
    """

    # Imported inside the function rather than at module level
    # NOTE(review): presumably to avoid a circular import with the package itself -- confirm
    import pointblank

    sep_line = "-" * 70

    api_text = (
        f"{sep_line}\nThis is the API documentation for the Pointblank library.\n{sep_line}\n\n"
    )

    #
    # Lists of exported functions and methods in different families
    #

    validate_exported = [
        "Validate",
        "Thresholds",
        "Schema",
    ]

    val_steps_exported = [
        "Validate.col_vals_gt",
        "Validate.col_vals_lt",
        "Validate.col_vals_ge",
        "Validate.col_vals_le",
        "Validate.col_vals_eq",
        "Validate.col_vals_ne",
        "Validate.col_vals_between",
        "Validate.col_vals_outside",
        "Validate.col_vals_in_set",
        "Validate.col_vals_not_in_set",
        "Validate.col_vals_null",
        "Validate.col_vals_not_null",
        "Validate.col_vals_regex",
        "Validate.col_vals_expr",
        "Validate.col_exists",
        "Validate.rows_distinct",
        "Validate.col_schema_match",
        "Validate.row_count_match",
        "Validate.col_count_match",
    ]

    column_selection_exported = [
        "col",
        "starts_with",
        "ends_with",
        "contains",
        "matches",
        "everything",
        "first_n",
        "last_n",
    ]

    interrogation_exported = [
        "Validate.interrogate",
        "Validate.get_tabular_report",
        "Validate.get_step_report",
        "Validate.get_json_report",
        "Validate.get_sundered_data",
        "Validate.get_data_extracts",
        "Validate.all_passed",
        "Validate.n",
        "Validate.n_passed",
        "Validate.n_failed",
        "Validate.f_passed",
        "Validate.f_failed",
        "Validate.warn",
        "Validate.stop",
        "Validate.notify",
    ]

    utilities_exported = [
        "load_dataset",
        "preview",
        "get_column_count",
        "get_row_count",
        "config",
    ]

    validate_desc = """When performing data validation, you'll need the `Validate` class to get the
process started. It's given the target table and you can optionally provide some metadata and/or
failure thresholds (using the `Thresholds` class or through shorthands for this task). The
`Validate` class has numerous methods for defining validation steps and for obtaining
post-interrogation metrics and data."""

    val_steps_desc = """Validation steps can be thought of as sequential validations on the target
data. We call `Validate`'s validation methods to build up a validation plan: a collection of steps
that, in the aggregate, provides good validation coverage."""

    column_selection_desc = """A flexible way to select columns for validation is to use the `col()`
function along with column selection helper functions. A combination of `col()` + `starts_with()`,
`matches()`, etc., allows for the selection of multiple target columns (mapping a validation across
many steps). Furthermore, the `col()` function can be used to declare a comparison column (e.g.,
for the `value=` argument in many `col_vals_*()` methods) when you can't use a fixed value
for comparison."""

    interrogation_desc = """The validation plan is put into action when `interrogate()` is called.
The workflow for performing a comprehensive validation is then: (1) `Validate()`, (2) adding
validation steps, (3) `interrogate()`. After interrogation of the data, we can view a validation
report table (by printing the object or using `get_tabular_report()`), extract key metrics, or we
can split the data based on the validation results (with `get_sundered_data()`)."""

    utilities_desc = """The utilities group contains functions that are helpful for the validation
process. We can load datasets with `load_dataset()`, preview a table with `preview()`, and set
global configuration parameters with `config()`."""

    #
    # Add headings (`*_desc` text) and API details for each family of functions/methods
    #

    families = [
        ("Validate", validate_desc, validate_exported),
        ("Validation Steps", val_steps_desc, val_steps_exported),
        ("Column Selection", column_selection_desc, column_selection_exported),
        ("Interrogation and Reporting", interrogation_desc, interrogation_exported),
        ("Utilities", utilities_desc, utilities_exported),
    ]

    for family_name, family_desc, family_exported in families:
        api_text += f"\n## The {family_name} family\n\n{family_desc}\n\n"
        api_text += get_api_details(module=pointblank, exported_list=family_exported)

    # Modify language syntax in all code cells
    api_text = api_text.replace("{python}", "python")

    # Remove code cells that contain `#| echo: false` (i.e., don't display the code). Every
    # fenced Python cell is inspected and dropped (together with the whitespace that follows it)
    # only when its body carries that directive; all other cells are put back untouched.
    api_text = re.sub(
        r"```python\n(.*?)```\n\s*",
        lambda cell: "" if "#| echo: false" in cell.group(1) else cell.group(0),
        api_text,
        flags=re.DOTALL,
    )

    return api_text
569+
570+
571+
def _get_examples_text() -> str:
572+
"""
573+
Get the examples for the Pointblank library. These examples are extracted from the Quarto
574+
documents in the `docs/demos` directory.
575+
576+
Returns
577+
-------
578+
str
579+
The examples for the Pointblank library.
580+
"""
581+
582+
sep_line = "-" * 70
583+
584+
examples_text = (
585+
f"{sep_line}\nThis is a set of examples for the Pointblank library.\n{sep_line}\n\n"
586+
)
587+
588+
# A large set of examples is available in the docs/demos directory, and each of the
589+
# subdirectories contains a different example (in the form of a Quarto document)
590+
591+
example_dirs = [
592+
"01-starter",
593+
"02-advanced",
594+
"03-data-extracts",
595+
"04-sundered-data",
596+
"05-step-report-column-check",
597+
"06-step-report-schema-check",
598+
"apply-checks-to-several-columns",
599+
"check-row-column-counts",
600+
"checks-for-missing",
601+
"col-vals-custom-expr",
602+
"column-selector-functions",
603+
"comparisons-across-columns",
604+
"expect-no-duplicate-rows",
605+
"expect-no-duplicate-values",
606+
"expect-text-pattern",
607+
"failure-thresholds",
608+
"mutate-table-in-step",
609+
"numeric-comparisons",
610+
"schema-check",
611+
"set-membership",
612+
"using-parquet-data",
613+
]
614+
615+
for example_dir in example_dirs:
616+
617+
link = f"https://posit-dev.github.io/pointblank/demos/{example_dir}/"
618+
619+
# Read in the index.qmd file for each example
620+
with open(f"docs/demos/{example_dir}/index.qmd", "r") as f:
621+
example_text = f.read()
622+
623+
# Remove the first eight lines of the example text (contains the YAML front matter)
624+
example_text = "\n".join(example_text.split("\n")[8:])
625+
626+
# Extract the title of the example (the line beginning with `###`)
627+
title = re.search(r"### (.*)", example_text).group(1)
628+
629+
# The next line with text is the short description of the example
630+
desc = re.search(r"(.*)\.", example_text).group(1)
631+
632+
# Get all of the Python code blocks in the example
633+
# these can be identified as starting with ```python and ending with ```
634+
code_blocks = re.findall(r"```python\n(.*?)```", example_text, re.DOTALL)
635+
636+
# Wrap each code block with a leading ```python and trailing ```
637+
code_blocks = [f"```python\n{code}```" for code in code_blocks]
638+
639+
# Collapse all code blocks into a single string
640+
code_text = "\n\n".join(code_blocks)
641+
642+
# Add the example title, description, and code to the examples text
643+
examples_text += f"### {title} ({link})\n\n{desc}\n\n{code_text}\n\n"
644+
645+
return examples_text
646+
647+
648+
def _get_api_and_examples_text() -> str:
    """
    Build one text blob holding the Pointblank API documentation followed by the examples.

    Returns
    -------
    str
        The result of `_get_api_text()` and the result of `_get_examples_text()`, in that order,
        joined by two newline characters.
    """

    # The tuple is evaluated left to right, so the API text is generated before the examples
    combined_parts = (_get_api_text(), _get_examples_text())

    return "\n\n".join(combined_parts)

tests/test__utils.py

+18
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121
_check_invalid_fields,
2222
_select_df_lib,
2323
_get_tbl_type,
24+
_get_api_text,
25+
_get_examples_text,
26+
_get_api_and_examples_text,
2427
)
2528

2629

@@ -426,3 +429,18 @@ def test_get_tbl_type():
426429

427430
assert _get_tbl_type(pd.DataFrame()) == "pandas"
428431
assert _get_tbl_type(pl.DataFrame()) == "polars"
432+
433+
434+
def test_get_api_text():
    # Smoke test: generating the API text must succeed and yield a string
    api_text = _get_api_text()

    assert isinstance(api_text, str)
437+
438+
439+
def test_get_examples_text():
    # Smoke test: generating the examples text must succeed and yield a string
    examples_text = _get_examples_text()

    assert isinstance(examples_text, str)
442+
443+
444+
def test_get_api_and_examples_text():
    # Smoke test: generating the combined text must succeed and yield a string
    combined_text = _get_api_and_examples_text()

    assert isinstance(combined_text, str)

0 commit comments

Comments
 (0)