Skip to content

Commit 36422a8

Browse files
PKG: Exclude data test files. (#19535)
1 parent 95427d5 commit 36422a8

36 files changed

+392
-347
lines changed

MANIFEST.in

+23-11
Original file line numberDiff line numberDiff line change
@@ -3,27 +3,39 @@ include LICENSE
33
include RELEASE.md
44
include README.md
55
include setup.py
6-
include pyproject.toml
76

87
graft doc
98
prune doc/build
109

10+
graft LICENSES
11+
1112
graft pandas
1213

13-
global-exclude *.so
14-
global-exclude *.pyd
14+
global-exclude *.bz2
15+
global-exclude *.csv
16+
global-exclude *.dta
17+
global-exclude *.gz
18+
global-exclude *.h5
19+
global-exclude *.html
20+
global-exclude *.json
21+
global-exclude *.msgpack
22+
global-exclude *.pickle
23+
global-exclude *.png
1524
global-exclude *.pyc
25+
global-exclude *.pyd
26+
global-exclude *.sas7bdat
27+
global-exclude *.so
28+
global-exclude *.xls
29+
global-exclude *.xlsm
30+
global-exclude *.xlsx
31+
global-exclude *.xpt
32+
global-exclude *.xz
33+
global-exclude *.zip
1634
global-exclude *~
17-
global-exclude \#*
18-
global-exclude .git*
1935
global-exclude .DS_Store
20-
global-exclude *.png
36+
global-exclude .git*
37+
global-exclude \#*
2138

22-
# include examples/data/*
23-
# recursive-include examples *.py
24-
# recursive-include doc/source *
25-
# recursive-include doc/sphinxext *
26-
# recursive-include LICENSES *
2739
include versioneer.py
2840
include pandas/_version.py
2941
include pandas/io/formats/templates/*.tpl

ci/script_single.sh

+4-4
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,12 @@ if [ "$DOC" ]; then
2525
echo "We are not running pytest as this is a doc-build"
2626

2727
elif [ "$COVERAGE" ]; then
28-
echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas
29-
pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas
28+
echo pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas
29+
pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas
3030

3131
else
32-
echo pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas
33-
pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest
32+
echo pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas
33+
pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest
3434

3535
fi
3636

doc/source/whatsnew/v0.23.2.txt

+5
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,11 @@ Documentation Changes
7676
-
7777
-
7878

79+
Build Changes
80+
-------------
81+
82+
- The source and binary distributions no longer include test data files, resulting in smaller download sizes. Tests relying on these data files will be skipped when using ``pandas.test()``. (:issue:`19320`)
83+
7984
.. _whatsnew_0232.bug_fixes:
8085

8186
Bug Fixes

pandas/conftest.py

+41
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
import os
12
import importlib
23

34
import pytest
45

6+
import pandas
57
import numpy as np
68
import pandas as pd
79
from pandas.compat import PY3
@@ -17,6 +19,8 @@ def pytest_addoption(parser):
1719
help="run high memory tests")
1820
parser.addoption("--only-slow", action="store_true",
1921
help="run only slow tests")
22+
parser.addoption("--strict-data-files", action="store_true",
23+
help="Fail if a test is skipped for missing data file.")
2024

2125

2226
def pytest_runtest_setup(item):
@@ -131,6 +135,43 @@ def join_type(request):
131135
return request.param
132136

133137

138+
@pytest.fixture
139+
def datapath(request):
140+
"""Get the path to a data file.
141+
142+
Parameters
143+
----------
144+
path : str
145+
Path to the file, relative to ``pandas/tests/``
146+
147+
Returns
148+
-------
149+
path : path including ``pandas/tests``.
150+
151+
Raises
152+
------
153+
ValueError
154+
If the path doesn't exist and the --strict-data-files option is set.
155+
"""
156+
def deco(*args):
157+
path = os.path.join('pandas', 'tests', *args)
158+
if not os.path.exists(path):
159+
if request.config.getoption("--strict-data-files"):
160+
msg = "Could not find file {} and --strict-data-files is set."
161+
raise ValueError(msg.format(path))
162+
else:
163+
msg = "Could not find {}."
164+
pytest.skip(msg.format(path))
165+
return path
166+
return deco
167+
168+
169+
@pytest.fixture
170+
def iris(datapath):
171+
"""The iris dataset as a DataFrame."""
172+
return pandas.read_csv(datapath('data', 'iris.csv'))
173+
174+
134175
@pytest.fixture(params=['nlargest', 'nsmallest'])
135176
def nselect_method(request):
136177
"""

pandas/tests/indexes/test_multi.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -1182,12 +1182,12 @@ def test_iter(self):
11821182
('baz', 'two'), ('qux', 'one'), ('qux', 'two')]
11831183
assert result == expected
11841184

1185-
def test_legacy_pickle(self):
1185+
def test_legacy_pickle(self, datapath):
11861186
if PY3:
11871187
pytest.skip("testing for legacy pickles not "
11881188
"support on py3")
11891189

1190-
path = tm.get_data_path('multiindex_v1.pickle')
1190+
path = datapath('indexes', 'data', 'multiindex_v1.pickle')
11911191
obj = pd.read_pickle(path)
11921192

11931193
obj2 = MultiIndex.from_tuples(obj.values)
@@ -1203,10 +1203,10 @@ def test_legacy_pickle(self):
12031203
assert_almost_equal(res, exp)
12041204
assert_almost_equal(exp, exp2)
12051205

1206-
def test_legacy_v2_unpickle(self):
1206+
def test_legacy_v2_unpickle(self, datapath):
12071207

12081208
# 0.7.3 -> 0.8.0 format manage
1209-
path = tm.get_data_path('mindex_073.pickle')
1209+
path = datapath('indexes', 'data', 'mindex_073.pickle')
12101210
obj = pd.read_pickle(path)
12111211

12121212
obj2 = MultiIndex.from_tuples(obj.values)

pandas/tests/io/conftest.py

+6-15
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,23 @@
1-
import os
2-
31
import pytest
42
from pandas.io.parsers import read_table
5-
from pandas.util import testing as tm
6-
7-
8-
@pytest.fixture
9-
def parser_data(request):
10-
return os.path.join(tm.get_data_path(), '..', 'parser', 'data')
113

124

135
@pytest.fixture
14-
def tips_file(parser_data):
6+
def tips_file(datapath):
157
"""Path to the tips dataset"""
16-
return os.path.join(parser_data, 'tips.csv')
8+
return datapath('io', 'parser', 'data', 'tips.csv')
179

1810

1911
@pytest.fixture
20-
def jsonl_file(parser_data):
12+
def jsonl_file(datapath):
2113
"""Path to a JSONL dataset"""
22-
return os.path.join(parser_data, 'items.jsonl')
14+
return datapath('io', 'parser', 'data', 'items.jsonl')
2315

2416

2517
@pytest.fixture
26-
def salaries_table(parser_data):
18+
def salaries_table(datapath):
2719
"""DataFrame with the salaries dataset"""
28-
path = os.path.join(parser_data, 'salaries.csv')
29-
return read_table(path)
20+
return read_table(datapath('io', 'parser', 'data', 'salaries.csv'))
3021

3122

3223
@pytest.fixture

pandas/tests/io/formats/test_format.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -916,8 +916,8 @@ def test_unicode_problem_decoding_as_ascii(self):
916916
dm = DataFrame({u('c/\u03c3'): Series({'test': np.nan})})
917917
compat.text_type(dm.to_string())
918918

919-
def test_string_repr_encoding(self):
920-
filepath = tm.get_data_path('unicode_series.csv')
919+
def test_string_repr_encoding(self, datapath):
920+
filepath = datapath('io', 'formats', 'data', 'unicode_series.csv')
921921
df = pd.read_csv(filepath, header=None, encoding='latin1')
922922
repr(df)
923923
repr(df[1])

pandas/tests/io/json/test_compression.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@ def test_compression_roundtrip(compression):
2121
assert_frame_equal(df, pd.read_json(result))
2222

2323

24-
def test_read_zipped_json():
25-
uncompressed_path = tm.get_data_path("tsframe_v012.json")
24+
def test_read_zipped_json(datapath):
25+
uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
2626
uncompressed_df = pd.read_json(uncompressed_path)
2727

28-
compressed_path = tm.get_data_path("tsframe_v012.json.zip")
28+
compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
2929
compressed_df = pd.read_json(compressed_path, compression='zip')
3030

3131
assert_frame_equal(uncompressed_df, compressed_df)

pandas/tests/io/json/test_pandas.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,9 @@
3737

3838
class TestPandasContainer(object):
3939

40-
def setup_method(self, method):
41-
self.dirpath = tm.get_data_path()
40+
@pytest.fixture(scope="function", autouse=True)
41+
def setup(self, datapath):
42+
self.dirpath = datapath("io", "json", "data")
4243

4344
self.ts = tm.makeTimeSeries()
4445
self.ts.name = 'ts'
@@ -59,7 +60,8 @@ def setup_method(self, method):
5960
self.mixed_frame = _mixed_frame.copy()
6061
self.categorical = _cat_frame.copy()
6162

62-
def teardown_method(self, method):
63+
yield
64+
6365
del self.dirpath
6466

6567
del self.ts

pandas/tests/io/parser/common.py

+11-14
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def test_read_csv(self):
7777
else:
7878
prefix = u("file://")
7979

80-
fname = prefix + compat.text_type(self.csv1)
80+
fname = prefix + compat.text_type(os.path.abspath(self.csv1))
8181
self.read_csv(fname, index_col=0, parse_dates=True)
8282

8383
def test_1000_sep(self):
@@ -651,21 +651,19 @@ def test_read_csv_parse_simple_list(self):
651651
tm.assert_frame_equal(df, expected)
652652

653653
@tm.network
654-
def test_url(self):
654+
def test_url(self, datapath):
655655
# HTTP(S)
656656
url = ('https://raw.github.com/pandas-dev/pandas/master/'
657657
'pandas/tests/io/parser/data/salaries.csv')
658658
url_table = self.read_table(url)
659-
dirpath = tm.get_data_path()
660-
localtable = os.path.join(dirpath, 'salaries.csv')
659+
localtable = datapath('io', 'parser', 'data', 'salaries.csv')
661660
local_table = self.read_table(localtable)
662661
tm.assert_frame_equal(url_table, local_table)
663662
# TODO: ftp testing
664663

665664
@pytest.mark.slow
666-
def test_file(self):
667-
dirpath = tm.get_data_path()
668-
localtable = os.path.join(dirpath, 'salaries.csv')
665+
def test_file(self, datapath):
666+
localtable = datapath('io', 'parser', 'data', 'salaries.csv')
669667
local_table = self.read_table(localtable)
670668

671669
try:
@@ -755,8 +753,8 @@ def test_utf16_bom_skiprows(self):
755753

756754
tm.assert_frame_equal(result, expected)
757755

758-
def test_utf16_example(self):
759-
path = tm.get_data_path('utf16_ex.txt')
756+
def test_utf16_example(self, datapath):
757+
path = datapath('io', 'parser', 'data', 'utf16_ex.txt')
760758

761759
# it works! and is the right length
762760
result = self.read_table(path, encoding='utf-16')
@@ -767,8 +765,8 @@ def test_utf16_example(self):
767765
result = self.read_table(buf, encoding='utf-16')
768766
assert len(result) == 50
769767

770-
def test_unicode_encoding(self):
771-
pth = tm.get_data_path('unicode_series.csv')
768+
def test_unicode_encoding(self, datapath):
769+
pth = datapath('io', 'parser', 'data', 'unicode_series.csv')
772770

773771
result = self.read_csv(pth, header=None, encoding='latin-1')
774772
result = result.set_index(0)
@@ -1513,10 +1511,9 @@ def test_internal_eof_byte_to_file(self):
15131511
result = self.read_csv(path)
15141512
tm.assert_frame_equal(result, expected)
15151513

1516-
def test_sub_character(self):
1514+
def test_sub_character(self, datapath):
15171515
# see gh-16893
1518-
dirpath = tm.get_data_path()
1519-
filename = os.path.join(dirpath, "sub_char.csv")
1516+
filename = datapath('io', 'parser', 'data', 'sub_char.csv')
15201517

15211518
expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"])
15221519
result = self.read_csv(filename)

pandas/tests/io/parser/compression.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,9 @@ def test_read_csv_infer_compression(self):
120120

121121
tm.assert_frame_equal(expected, df)
122122

123-
def test_read_csv_compressed_utf16_example(self):
123+
def test_read_csv_compressed_utf16_example(self, datapath):
124124
# GH18071
125-
path = tm.get_data_path('utf16_ex_small.zip')
125+
path = datapath('io', 'parser', 'data', 'utf16_ex_small.zip')
126126

127127
result = self.read_csv(path, encoding='utf-16',
128128
compression='zip', sep='\t')

pandas/tests/io/parser/dtypes.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -125,17 +125,17 @@ def test_categorical_dtype_high_cardinality_numeric(self):
125125
np.sort(actual.a.cat.categories), ordered=True)
126126
tm.assert_frame_equal(actual, expected)
127127

128-
def test_categorical_dtype_encoding(self):
128+
def test_categorical_dtype_encoding(self, datapath):
129129
# GH 10153
130-
pth = tm.get_data_path('unicode_series.csv')
130+
pth = datapath('io', 'parser', 'data', 'unicode_series.csv')
131131
encoding = 'latin-1'
132132
expected = self.read_csv(pth, header=None, encoding=encoding)
133133
expected[1] = Categorical(expected[1])
134134
actual = self.read_csv(pth, header=None, encoding=encoding,
135135
dtype={1: 'category'})
136136
tm.assert_frame_equal(actual, expected)
137137

138-
pth = tm.get_data_path('utf16_ex.txt')
138+
pth = datapath('io', 'parser', 'data', 'utf16_ex.txt')
139139
encoding = 'utf-16'
140140
expected = self.read_table(pth, encoding=encoding)
141141
expected = expected.apply(Categorical)

0 commit comments

Comments
 (0)