From 8b29d18259a2ded1074c89607721dd2186eeb451 Mon Sep 17 00:00:00 2001 From: Brahmanand Singh Date: Sat, 26 Jun 2021 19:53:40 +0530 Subject: [PATCH 1/7] No new modules needed on python3+ --- requirements.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 4e2f0a7..0000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -unicodecsv==0.9.0 From 59d5a2d6e9466cd90f6eddf332d744b073ebc9da Mon Sep 17 00:00:00 2001 From: Brahmanand Singh Date: Sat, 26 Jun 2021 19:58:41 +0530 Subject: [PATCH 2/7] changes for python3, using csv and unix dialect --- json2csv.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/json2csv.py b/json2csv.py index e70ef12..685a5f4 100755 --- a/json2csv.py +++ b/json2csv.py @@ -1,15 +1,15 @@ #!/usr/bin/env python -try: - import unicodecsv as csv -except ImportError: - import csv - +#python3 is by default unicode +import csv import json import operator import os from collections import OrderedDict import logging +import argparse +#reduce is part of functools for py3 +import functools as ft logging.basicConfig(level=logging.DEBUG) @@ -59,7 +59,7 @@ def process_each(self, data): data = data[self.collection] for d in data: - logging.info(d) + #logging.info(d) self.rows.append(self.process_row(d)) def process_row(self, item): @@ -69,7 +69,7 @@ def process_row(self, item): for header, keys in self.key_map.items(): try: - row[header] = reduce(operator.getitem, keys, item) + row[header] = ft.reduce(operator.getitem, keys, item) except (KeyError, IndexError, TypeError): row[header] = None @@ -88,7 +88,7 @@ def make_string(self, item): elif isinstance(item, dict): return self.DICT_OPEN + self.DICT_SEP_CHAR.join([self.KEY_VAL_CHAR.join([k, self.make_string(val)]) for k, val in item.items()]) + self.DICT_CLOSE else: - return unicode(item) + return item def write_csv(self, filename='output.csv', make_strings=False): """Write the processed rows to the given filename @@ -99,8 +99,9 @@ def write_csv(self, filename='output.csv', make_strings=False): out = self.make_strings() else: out = self.rows - with open(filename, 'wb+') as f: - writer = csv.DictWriter(f, self.key_map.keys()) + #opening with write mode only and specifying unix dilect to quote all fields + with open(filename, 'w') as f: + writer = csv.DictWriter(f, self.key_map.keys(), dialect='unix') writer.writeheader() writer.writerows(out) @@ -119,7 +120,6 @@ def process_each(self, data, collection=None): def init_parser(): - import argparse parser = argparse.ArgumentParser(description="Converts JSON to CSV") parser.add_argument('json_file', type=argparse.FileType('r'), help="Path to JSON data file to load") @@ -153,3 +153,4 @@ def init_parser(): outfile = fileName + '.csv' loader.write_csv(filename=outfile, make_strings=args.strings) + From 48e7a30fc748deb4121ea3b6c60cb3fe00addd9b Mon Sep 17 00:00:00 2001 From: Brahmanand Singh Date: Sat, 26 Jun 2021 19:59:56 +0530 Subject: [PATCH 3/7] py3 changes and removed json key sorting --- get_outline.py | 88 ++++++++++++++++++++++ tests.py | 193 ------------------------------------------------- 2 files changed, 88 insertions(+), 193 deletions(-) create mode 100644 get_outline.py diff --git a/get_outline.py b/get_outline.py new file mode 100644 index 0000000..3067d6b --- /dev/null +++ b/get_outline.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python + +import json +import os, os.path + +def key_paths(d): + def helper(path, x): + if isinstance(x, dict): + for k, v in x.items(): + for ret in helper(path + [k], v): + yield ret + elif isinstance(x, list): + for i, item in enumerate(x): + for ret in helper(path + [i], item): + yield ret + else: + yield path + return helper([], d) + +def line_iter(f): + for line in f: + yield json.loads(line) + +def coll_iter(f, coll_key): + data = json.load(f) + for obj in data[coll_key]: + yield obj + +def gather_key_map(iterator): + key_map = {} + for d in iterator: + for path in key_paths(d): + key_map[tuple(path)] = True + return key_map + +def path_join(path, sep='.'): + return sep.join(str(k) for k in path) + +def key_map_to_list(key_map): + # We convert to strings *after* sorting so that array indices come out + # in the correct order. + # return [(path_join(k, '_'), path_join(k)) for k in sorted(key_map.keys())] + # to get keys in the order defined in json ,not doing any sort + return [(path_join(k, '_'), path_join(k)) for k in key_map.keys()] + +def make_outline(json_file, each_line, collection_key): + if each_line: + iterator = line_iter(json_file) + else: + iterator = coll_iter(json_file, collection_key) + + key_map = gather_key_map(iterator) + outline = {'map': key_map_to_list(key_map)} + if collection_key: + outline['collection'] = collection_key + + return outline + +def init_parser(): + import argparse + parser = argparse.ArgumentParser(description="Generate an outline file for json2csv.py") + parser.add_argument('json_file', type=argparse.FileType('r'), + help="Path to JSON data file to analyze") + parser.add_argument('-o', '--output-file', type=str, default=None, + help="Path to outline file to output") + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument('-e', '--each-line', action="store_true", default=False, + help="Process each line of JSON file separately") + group.add_argument('-c', '--collection', type=str, default=None, + help="Key in JSON of array to process", metavar="KEY") + + return parser + +def main(): + parser = init_parser() + args = parser.parse_args() + outline = make_outline(args.json_file, args.each_line, args.collection) + outfile = args.output_file + if outfile is None: + fileName, fileExtension = os.path.splitext(args.json_file.name) + outfile = fileName + '.outline.json' +#not soring the json keys + with open(outfile, 'w') as f: + json.dump(outline, f, indent=2, sort_keys=False) + +if __name__ == '__main__': + main() + diff --git a/tests.py b/tests.py index 821e4c3..e69de29 100644 --- a/tests.py +++ b/tests.py @@ -1,193 +0,0 @@ -import unittest -import json -from json2csv import Json2Csv, MultiLineJson2Csv -from gen_outline import make_outline - - -class TestJson2Csv(unittest.TestCase): - - def test_init(self): - outline = {'map': [['some_header', 'some_key']]} - loader = Json2Csv(outline) - self.assertIn('some_header', loader.key_map) - - self.assertRaises(ValueError, Json2Csv, None) - - self.assertRaises(ValueError, Json2Csv, {}) - - def test_process_row(self): - """Given a valid key-map and data, it should return a valid row""" - outline = {'map': [['id', '_id'], ['count', 'count']]} - loader = Json2Csv(outline) - test_data = json.loads('{"_id" : "Someone","count" : 1}') - row = loader.process_row(test_data) - - self.assertIs(type(row), dict) - self.assertIn('id', row.keys()) - self.assertIn('count', row.keys()) - - self.assertEquals(row['id'], 'Someone') - self.assertEquals(row['count'], 1) - - def test_process_row_nested_data(self): - """Ensure that nested keys (with . notation) are processed""" - key_map = {"map": [['author', 'source.author'], ['message', 'message.original']]} - loader = Json2Csv(key_map) - test_data = json.loads( - '{"source": {"author": "Someone"}, "message": {"original": "Hey!", "Revised": "Hey yo!"}}' - ) - row = loader.process_row(test_data) - - self.assertIs(type(row), dict) - self.assertIn('author', row.keys()) - self.assertIn('message', row.keys()) - - self.assertEquals(row['author'], 'Someone') - self.assertEquals(row['message'], 'Hey!') - - def test_process_row_array_index(self): - """Ensure that array indices are properly handled as part of the dot notation""" - pass - - def test_process_each(self): - outline = {'map': [['id', '_id'], ['count', 'count']], 'collection': 'result'} - loader = Json2Csv(outline) - - test_data = json.loads('{"result":[{"_id" : "Someone","count" : 1}]}') - loader.process_each(test_data) - - self.assertEquals(len(loader.rows), 1) - row = loader.rows[0] - self.assertIs(type(row), dict) - self.assertIn('id', row.keys()) - self.assertIn('count', row.keys()) - - self.assertEquals(row['id'], 'Someone') - self.assertEquals(row['count'], 1) - - def test_process_each_optional_key(self): - """Ensure a key that is not always present won't prevent data extraction - Where the data is missing, None is returned - """ - outline = {'map': [['id', '_id'], ['count', 'count'], ['tags_0', 'tags.0']]} - loader = Json2Csv(outline) - - test_data = json.loads('''[ - {"_id": "Someone","count": 1, "tags": ["super"]}, - {"_id": "Another", "tags": []}]''') - self.assertEquals(len(test_data), 2) - loader.process_each(test_data) - - self.assertEquals(len(loader.rows), 2) - second_row = loader.rows[1] - self.assertEquals(second_row['id'], 'Another') - # works for missing dict keys - self.assertIsNone(second_row['count']) - # and missing list indices - self.assertIsNone(second_row['tags_0']) - - def test_load_json(self): - outline = {"map": [['author', 'source.author'], ['message', 'message.original']], "collection": "nodes"} - loader = Json2Csv(outline) - with open('fixtures/data.json') as f: - loader.load(f) - - first_row = loader.rows[0] - self.assertEqual(first_row['author'], 'Someone') - second_row = loader.rows[1] - self.assertEqual(second_row['author'], 'Another') - third_row = loader.rows[2] - self.assertEqual(third_row['author'], 'Me too') - - def test_load_bare_json(self): - outline = {"map": [['author', 'source.author'], ['message', 'message.original']]} - loader = Json2Csv(outline) - with open('fixtures/bare_data.json') as f: - loader.load(f) - - first_row = loader.rows[0] - self.assertEqual(first_row['author'], 'Someone') - second_row = loader.rows[1] - self.assertEqual(second_row['author'], 'Another') - third_row = loader.rows[2] - self.assertEqual(third_row['author'], 'Me too') - - def test_write_csv(self): - pass - - -class TestMultiLineJson2Csv(unittest.TestCase): - - def test_line_delimited(self): - outline = {"map": [['author', 'source.author'], ['message', 'message.original']]} - loader = MultiLineJson2Csv(outline) - with open('fixtures/line_delimited.json') as f: - loader.load(f) - - first_row = loader.rows[0] - self.assertEqual(first_row['author'], 'Someone') - second_row = loader.rows[1] - self.assertEqual(second_row['author'], 'Another') - third_row = loader.rows[2] - self.assertEqual(third_row['author'], 'Me too') - - -class TestGenOutline(unittest.TestCase): - - def test_basic(self): - with open('fixtures/data.json') as json_file: - outline = make_outline(json_file, False, 'nodes') - expected = { - 'collection': 'nodes', - 'map': [ - ('message_Revised', 'message.Revised'), - ('message_original', 'message.original'), - ('source_author', 'source.author'), - ] - } - self.assertEqual(outline, expected) - - def test_deeply_nested(self): - with open('fixtures/deeply_nested.json') as json_file: - outline = make_outline(json_file, False, 'nodes') - expected = { - 'collection': 'nodes', - 'map': [ - ('one_0_two_0_three_0', 'one.0.two.0.three.0'), - ('one_0_two_0_three_1', 'one.0.two.0.three.1'), - ('one_0_two_0_three_2', 'one.0.two.0.three.2'), - ('one_0_two_1_three_0', 'one.0.two.1.three.0'), - ('one_0_two_1_three_1', 'one.0.two.1.three.1'), - ('one_0_two_1_three_2', 'one.0.two.1.three.2'), - ] - } - self.assertEqual(outline, expected) - - def test_different_keys_per_row(self): - "Outline should contain the union of the keys." - with open('fixtures/different_keys_per_row.json') as json_file: - outline = make_outline(json_file, False, 'nodes') - expected = { - 'collection': 'nodes', - 'map': [ - ('tags_0', 'tags.0'), - ('tags_1', 'tags.1'), - ('tags_2', 'tags.2'), - ('that', 'that'), - ('theother', 'theother'), - ('this', 'this'), - ] - } - self.assertEqual(outline, expected) - - def test_line_delimited(self): - with open('fixtures/line_delimited.json') as json_file: - outline = make_outline(json_file, True, None) - expected = { - 'map': [ - ('message_Revised', 'message.Revised'), - ('message_original', 'message.original'), - ('source_author', 'source.author'), - ] - } - self.assertEqual(outline, expected) From 06b1cd63112fc9358c237935988cadd4ae076559 Mon Sep 17 00:00:00 2001 From: Brahmanand Singh Date: Sat, 26 Jun 2021 20:00:41 +0530 Subject: [PATCH 4/7] Adjusted test cases accordingly --- tests.py | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) diff --git a/tests.py b/tests.py index e69de29..91b9e44 100644 --- a/tests.py +++ b/tests.py @@ -0,0 +1,194 @@ +import unittest +import json +from json2csv import Json2Csv, MultiLineJson2Csv +from gen_outline import make_outline + + +class TestJson2Csv(unittest.TestCase): + + def test_init(self): + outline = {'map': [['some_header', 'some_key']]} + loader = Json2Csv(outline) + self.assertIn('some_header', loader.key_map) + + self.assertRaises(ValueError, Json2Csv, None) + + self.assertRaises(ValueError, Json2Csv, {}) + + def test_process_row(self): + """Given a valid key-map and data, it should return a valid row""" + outline = {'map': [['id', '_id'], ['count', 'count']]} + loader = Json2Csv(outline) + test_data = json.loads('{"_id" : "Someone","count" : 1}') + row = loader.process_row(test_data) + + self.assertIs(type(row), dict) + self.assertIn('id', row.keys()) + self.assertIn('count', row.keys()) + + self.assertEqual(row['id'], 'Someone') + self.assertEqual(row['count'], 1) + + def test_process_row_nested_data(self): + """Ensure that nested keys (with . notation) are processed""" + key_map = {"map": [['author', 'source.author'], ['message', 'message.original']]} + loader = Json2Csv(key_map) + test_data = json.loads( + '{"source": {"author": "Someone"}, "message": {"original": "Hey!", "Revised": "Hey yo!"}}' + ) + row = loader.process_row(test_data) + + self.assertIs(type(row), dict) + self.assertIn('author', row.keys()) + self.assertIn('message', row.keys()) + + self.assertEqual(row['author'], 'Someone') + self.assertEqual(row['message'], 'Hey!') + + def test_process_row_array_index(self): + """Ensure that array indices are properly handled as part of the dot notation""" + pass + + def test_process_each(self): + outline = {'map': [['id', '_id'], ['count', 'count']], 'collection': 'result'} + loader = Json2Csv(outline) + + test_data = json.loads('{"result":[{"_id" : "Someone","count" : 1}]}') + loader.process_each(test_data) + + self.assertEqual(len(loader.rows), 1) + row = loader.rows[0] + self.assertIs(type(row), dict) + self.assertIn('id', row.keys()) + self.assertIn('count', row.keys()) + + self.assertEqual(row['id'], 'Someone') + self.assertEqual(row['count'], 1) + + def test_process_each_optional_key(self): + """Ensure a key that is not always present won't prevent data extraction + Where the data is missing, None is returned + """ + outline = {'map': [['id', '_id'], ['count', 'count'], ['tags_0', 'tags.0']]} + loader = Json2Csv(outline) + + test_data = json.loads('''[ + {"_id": "Someone","count": 1, "tags": ["super"]}, + {"_id": "Another", "tags": []}]''') + self.assertEqual(len(test_data), 2) + loader.process_each(test_data) + + self.assertEqual(len(loader.rows), 2) + second_row = loader.rows[1] + self.assertEqual(second_row['id'], 'Another') + # works for missing dict keys + self.assertIsNone(second_row['count']) + # and missing list indices + self.assertIsNone(second_row['tags_0']) + + def test_load_json(self): + outline = {"map": [['author', 'source.author'], ['message', 'message.original']], "collection": "nodes"} + loader = Json2Csv(outline) + with open('fixtures/data.json') as f: + loader.load(f) + + first_row = loader.rows[0] + self.assertEqual(first_row['author'], 'Someone') + second_row = loader.rows[1] + self.assertEqual(second_row['author'], 'Another') + third_row = loader.rows[2] + self.assertEqual(third_row['author'], 'Me too') + + def test_load_bare_json(self): + outline = {"map": [['author', 'source.author'], ['message', 'message.original']]} + loader = Json2Csv(outline) + with open('fixtures/bare_data.json') as f: + loader.load(f) + + first_row = loader.rows[0] + self.assertEqual(first_row['author'], 'Someone') + second_row = loader.rows[1] + self.assertEqual(second_row['author'], 'Another') + third_row = loader.rows[2] + self.assertEqual(third_row['author'], 'Me too') + + def test_write_csv(self): + pass + + +class TestMultiLineJson2Csv(unittest.TestCase): + + def test_line_delimited(self): + outline = {"map": [['author', 'source.author'], ['message', 'message.original']]} + loader = MultiLineJson2Csv(outline) + with open('fixtures/line_delimited.json') as f: + loader.load(f) + + first_row = loader.rows[0] + self.assertEqual(first_row['author'], 'Someone') + second_row = loader.rows[1] + self.assertEqual(second_row['author'], 'Another') + third_row = loader.rows[2] + self.assertEqual(third_row['author'], 'Me too') + + +class TestGenOutline(unittest.TestCase): + + def test_basic(self): + with open('fixtures/data.json') as json_file: + outline = make_outline(json_file, False, 'nodes') + expected = { + 'collection': 'nodes', + 'map': [ + ('source_author', 'source.author'), + ('message_original', 'message.original'), + ('message_Revised', 'message.Revised'), + ] + } + self.assertEqual(outline, expected) + + def test_deeply_nested(self): + with open('fixtures/deeply_nested.json') as json_file: + outline = make_outline(json_file, False, 'nodes') + expected = { + 'collection': 'nodes', + 'map': [ + ('one_0_two_0_three_0', 'one.0.two.0.three.0'), + ('one_0_two_0_three_1', 'one.0.two.0.three.1'), + ('one_0_two_0_three_2', 'one.0.two.0.three.2'), + ('one_0_two_1_three_0', 'one.0.two.1.three.0'), + ('one_0_two_1_three_1', 'one.0.two.1.three.1'), + ('one_0_two_1_three_2', 'one.0.two.1.three.2'), + ] + } + self.assertEqual(outline, expected) + + def test_different_keys_per_row(self): + "Outline should contain the union of the keys." + with open('fixtures/different_keys_per_row.json') as json_file: + outline = make_outline(json_file, False, 'nodes') + expected = { + 'collection': 'nodes', + 'map': [ + ('this', 'this'), + ('tags_0', 'tags.0'), + ('tags_1', 'tags.1'), + ('tags_2', 'tags.2'), + ('that', 'that'), + ('theother', 'theother'), + ] + } + self.assertEqual(outline, expected) + + def test_line_delimited(self): + with open('fixtures/line_delimited.json') as json_file: + outline = make_outline(json_file, True, None) + expected = { + 'map': [ + ('source_author', 'source.author'), + ('message_original', 'message.original'), + ('message_Revised', 'message.Revised'), + ] + } + self.assertEqual(outline, expected) + From 8ca5c6fb5d5db874f6ae2a7f9b749a203d4f8bf4 Mon Sep 17 00:00:00 2001 From: Brahmanand Singh Date: Sat, 26 Jun 2021 20:32:38 +0530 Subject: [PATCH 5/7] fixes --- fixtures/data.outline.json | 17 ++++ fixtures/different_keys_per_row.outline.json | 29 +++++++ fixtures/outline.json | 20 +++-- gen_outline.py | 11 ++- get_outline.py | 88 -------------------- 5 files changed, 68 insertions(+), 97 deletions(-) create mode 100644 fixtures/data.outline.json create mode 100644 fixtures/different_keys_per_row.outline.json mode change 100755 => 100644 gen_outline.py delete mode 100644 get_outline.py diff --git a/fixtures/data.outline.json b/fixtures/data.outline.json new file mode 100644 index 0000000..50ad405 --- /dev/null +++ b/fixtures/data.outline.json @@ -0,0 +1,17 @@ +{ + "map": [ + [ + "message_original", + "message.original" + ], + [ + "source_author", + "source.author" + ], + [ + "message_Revised", + "message.Revised" + ] + ], + "collection": "nodes" +} \ No newline at end of file diff --git a/fixtures/different_keys_per_row.outline.json b/fixtures/different_keys_per_row.outline.json new file mode 100644 index 0000000..afcf8b8 --- /dev/null +++ b/fixtures/different_keys_per_row.outline.json @@ -0,0 +1,29 @@ +{ + "collection": "nodes", + "map": [ + [ + "that", + "that" + ], + [ + "tags_2", + "tags.2" + ], + [ + "theother", + "theother" + ], + [ + "this", + "this" + ], + [ + "tags_0", + "tags.0" + ], + [ + "tags_1", + "tags.1" + ] + ] +} \ No newline at end of file diff --git a/fixtures/outline.json b/fixtures/outline.json index b8d09b8..9844e7f 100644 --- a/fixtures/outline.json +++ b/fixtures/outline.json @@ -1,7 +1,17 @@ { - "map": [ - ["author", "source.author"], - ["message", "message.original"] + "map": [ + [ + "source_author", + "source.author" ], - "collection": "nodes" -} + [ + "message_Revised", + "message.Revised" + ], + [ + "message_original", + "message.original" + ] + ], + "collection": "nodes" +} \ No newline at end of file diff --git a/gen_outline.py b/gen_outline.py old mode 100755 new mode 100644 index 8ab20aa..3067d6b --- a/gen_outline.py +++ b/gen_outline.py @@ -6,7 +6,7 @@ def key_paths(d): def helper(path, x): if isinstance(x, dict): - for k, v in x.iteritems(): + for k, v in x.items(): for ret in helper(path + [k], v): yield ret elif isinstance(x, list): @@ -39,7 +39,9 @@ def path_join(path, sep='.'): def key_map_to_list(key_map): # We convert to strings *after* sorting so that array indices come out # in the correct order. - return [(path_join(k, '_'), path_join(k)) for k in sorted(key_map.keys())] + # return [(path_join(k, '_'), path_join(k)) for k in sorted(key_map.keys())] + # to get keys in the order defined in json ,not doing any sort + return [(path_join(k, '_'), path_join(k)) for k in key_map.keys()] def make_outline(json_file, each_line, collection_key): if each_line: @@ -77,9 +79,10 @@ def main(): if outfile is None: fileName, fileExtension = os.path.splitext(args.json_file.name) outfile = fileName + '.outline.json' - +#not soring the json keys with open(outfile, 'w') as f: - json.dump(outline, f, indent=2, sort_keys=True) + json.dump(outline, f, indent=2, sort_keys=False) if __name__ == '__main__': main() + diff --git a/get_outline.py b/get_outline.py deleted file mode 100644 index 3067d6b..0000000 --- a/get_outline.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python - -import json -import os, os.path - -def key_paths(d): - def helper(path, x): - if isinstance(x, dict): - for k, v in x.items(): - for ret in helper(path + [k], v): - yield ret - elif isinstance(x, list): - for i, item in enumerate(x): - for ret in helper(path + [i], item): - yield ret - else: - yield path - return helper([], d) - -def line_iter(f): - for line in f: - yield json.loads(line) - -def coll_iter(f, coll_key): - data = json.load(f) - for obj in data[coll_key]: - yield obj - -def gather_key_map(iterator): - key_map = {} - for d in iterator: - for path in key_paths(d): - key_map[tuple(path)] = True - return key_map - -def path_join(path, sep='.'): - return sep.join(str(k) for k in path) - -def key_map_to_list(key_map): - # We convert to strings *after* sorting so that array indices come out - # in the correct order. - # return [(path_join(k, '_'), path_join(k)) for k in sorted(key_map.keys())] - # to get keys in the order defined in json ,not doing any sort - return [(path_join(k, '_'), path_join(k)) for k in key_map.keys()] - -def make_outline(json_file, each_line, collection_key): - if each_line: - iterator = line_iter(json_file) - else: - iterator = coll_iter(json_file, collection_key) - - key_map = gather_key_map(iterator) - outline = {'map': key_map_to_list(key_map)} - if collection_key: - outline['collection'] = collection_key - - return outline - -def init_parser(): - import argparse - parser = argparse.ArgumentParser(description="Generate an outline file for json2csv.py") - parser.add_argument('json_file', type=argparse.FileType('r'), - help="Path to JSON data file to analyze") - parser.add_argument('-o', '--output-file', type=str, default=None, - help="Path to outline file to output") - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('-e', '--each-line', action="store_true", default=False, - help="Process each line of JSON file separately") - group.add_argument('-c', '--collection', type=str, default=None, - help="Key in JSON of array to process", metavar="KEY") - - return parser - -def main(): - parser = init_parser() - args = parser.parse_args() - outline = make_outline(args.json_file, args.each_line, args.collection) - outfile = args.output_file - if outfile is None: - fileName, fileExtension = os.path.splitext(args.json_file.name) - outfile = fileName + '.outline.json' -#not soring the json keys - with open(outfile, 'w') as f: - json.dump(outline, f, indent=2, sort_keys=False) - -if __name__ == '__main__': - main() - From 35087d15dacd8c27bdc85b655b270ae4832168c5 Mon Sep 17 00:00:00 2001 From: Brahmanand Singh Date: Sat, 26 Jun 2021 23:40:27 +0530 Subject: [PATCH 6/7] fixes and unittest --- fixtures/data.outline.json | 32 +++++------ fixtures/different_keys_per_row.outline.json | 56 ++++++++++---------- fixtures/line_delimited.outline.json | 16 ++++++ fixtures/nested_json_frt.json | 46 ++++++++++++++++ fixtures/nested_json_frt.outline.json | 37 +++++++++++++ fixtures/outline.json | 32 +++++------ gen_outline.py | 9 ++-- tests.py | 11 ++-- 8 files changed, 168 insertions(+), 71 deletions(-) mode change 100644 => 100755 fixtures/data.outline.json mode change 100644 => 100755 fixtures/different_keys_per_row.outline.json create mode 100755 fixtures/line_delimited.outline.json create mode 100755 fixtures/nested_json_frt.json create mode 100755 fixtures/nested_json_frt.outline.json mode change 100644 => 100755 fixtures/outline.json mode change 100644 => 100755 gen_outline.py diff --git a/fixtures/data.outline.json b/fixtures/data.outline.json old mode 100644 new mode 100755 index 50ad405..a93ad96 --- a/fixtures/data.outline.json +++ b/fixtures/data.outline.json @@ -1,17 +1,17 @@ -{ - "map": [ - [ - "message_original", - "message.original" - ], - [ - "source_author", - "source.author" - ], - [ - "message_Revised", - "message.Revised" - ] - ], - "collection": "nodes" +{ + "collection": "nodes", + "map": [ + [ + "message_Revised", + "message.Revised" + ], + [ + "message_original", + "message.original" + ], + [ + "source_author", + "source.author" + ] + ] } \ No newline at end of file diff --git a/fixtures/different_keys_per_row.outline.json b/fixtures/different_keys_per_row.outline.json old mode 100644 new mode 100755 index afcf8b8..c502506 --- a/fixtures/different_keys_per_row.outline.json +++ b/fixtures/different_keys_per_row.outline.json @@ -1,29 +1,29 @@ -{ - "collection": "nodes", - "map": [ - [ - "that", - "that" - ], - [ - "tags_2", - "tags.2" - ], - [ - "theother", - "theother" - ], - [ - "this", - "this" - ], - [ - "tags_0", - "tags.0" - ], - [ - "tags_1", - "tags.1" - ] - ] +{ + "map": [ + [ + "this", + "this" + ], + [ + "tags_0", + "tags.0" + ], + [ + "tags_1", + "tags.1" + ], + [ + "tags_2", + "tags.2" + ], + [ + "that", + "that" + ], + [ + "theother", + "theother" + ] + ], + "collection": "nodes" } \ No newline at end of file diff --git a/fixtures/line_delimited.outline.json b/fixtures/line_delimited.outline.json new file mode 100755 index 0000000..447be50 --- /dev/null +++ b/fixtures/line_delimited.outline.json @@ -0,0 +1,16 @@ +{ + "map": [ + [ + "source_author", + "source.author" + ], + [ + "message_original", + "message.original" + ], + [ + "message_Revised", + "message.Revised" + ] + ] +} \ No newline at end of file diff --git a/fixtures/nested_json_frt.json b/fixtures/nested_json_frt.json new file mode 100755 index 0000000..d7001bd --- /dev/null +++ b/fixtures/nested_json_frt.json @@ -0,0 +1,46 @@ +{ + "fruit": [ + { + "name": "Apple", + "binomial name": "Malus domestica", + "major_producers": [ + "China", + "United States", + "Turkey" + ], + "nutrition": { + "carbohydrates": "13.81g", + "fat": "0.17g", + "protein": "0.26g" + } + }, + { + "name": "Orange", + "binomial name": "Citrus x sinensis", + "major_producers": [ + "Brazil", + "United States", + "India" + ], + "nutrition": { + "carbohydrates": "11.75g", + "fat": "0.12g", + "protein": "0.94g" + } + }, + { + "name": "Mango", + "binomial name": "Mangifera indica", + "major_producers": [ + "India", + "China", + "Thailand" + ], + "nutrition": { + "carbohydrates": "15g", + "fat": "0.38g", + "protein": "0.82g" + } + } + ] +} \ No newline at end of file diff --git a/fixtures/nested_json_frt.outline.json b/fixtures/nested_json_frt.outline.json new file mode 100755 index 0000000..0c2e5da --- /dev/null +++ b/fixtures/nested_json_frt.outline.json @@ -0,0 +1,37 @@ +{ + "map": [ + [ + "name", + "name" + ], + [ + "binomial name", + "binomial name" + ], + [ + "major_producers_0", + "major_producers.0" + ], + [ + "major_producers_1", + "major_producers.1" + ], + [ + "major_producers_2", + "major_producers.2" + ], + [ + "nutrition_carbohydrates", + "nutrition.carbohydrates" + ], + [ + "nutrition_fat", + "nutrition.fat" + ], + [ + "nutrition_protein", + "nutrition.protein" + ] + ], + "collection": "fruit" +} \ No newline at end of file diff --git a/fixtures/outline.json b/fixtures/outline.json old mode 100644 new mode 100755 index 9844e7f..a93ad96 --- a/fixtures/outline.json +++ b/fixtures/outline.json @@ -1,17 +1,17 @@ -{ - "map": [ - [ - "source_author", - "source.author" - ], - [ - "message_Revised", - "message.Revised" - ], - [ - "message_original", - "message.original" - ] - ], - "collection": "nodes" +{ + "collection": "nodes", + "map": [ + [ + "message_Revised", + "message.Revised" + ], + [ + "message_original", + "message.original" + ], + [ + "source_author", + "source.author" + ] + ] } \ No newline at end of file diff --git a/gen_outline.py b/gen_outline.py old mode 100644 new mode 100755 index 3067d6b..51dfb23 --- a/gen_outline.py +++ b/gen_outline.py @@ -39,9 +39,9 @@ def path_join(path, sep='.'): def key_map_to_list(key_map): # We convert to strings *after* sorting so that array indices come out # in the correct order. - # return [(path_join(k, '_'), path_join(k)) for k in sorted(key_map.keys())] + return [(path_join(k, '_'), path_join(k)) for k in sorted(key_map.keys())] # to get keys in the order defined in json ,not doing any sort - return [(path_join(k, '_'), path_join(k)) for k in key_map.keys()] + # return [(path_join(k, '_'), path_join(k)) for k in key_map.keys()] def make_outline(json_file, each_line, collection_key): if each_line: @@ -79,10 +79,9 @@ def main(): if outfile is None: fileName, fileExtension = os.path.splitext(args.json_file.name) outfile = fileName + '.outline.json' -#not soring the json keys +#sort the key , this will make sure that the output file will always have the same order of fields with open(outfile, 'w') as f: - json.dump(outline, f, indent=2, sort_keys=False) + json.dump(outline, f, indent=2, sort_keys=True) if __name__ == '__main__': main() - diff --git a/tests.py b/tests.py index 91b9e44..580a121 100644 --- a/tests.py +++ b/tests.py @@ -140,9 +140,9 @@ def test_basic(self): expected = { 'collection': 'nodes', 'map': [ - ('source_author', 'source.author'), - ('message_original', 'message.original'), ('message_Revised', 'message.Revised'), + ('message_original', 'message.original'), + ('source_author', 'source.author'), ] } self.assertEqual(outline, expected) @@ -170,12 +170,12 @@ def test_different_keys_per_row(self): expected = { 'collection': 'nodes', 'map': [ - ('this', 'this'), ('tags_0', 'tags.0'), ('tags_1', 'tags.1'), ('tags_2', 'tags.2'), ('that', 'that'), ('theother', 'theother'), + ('this', 'this'), ] } self.assertEqual(outline, expected) @@ -185,10 +185,9 @@ def test_line_delimited(self): outline = make_outline(json_file, True, None) expected = { 'map': [ - ('source_author', 'source.author'), - ('message_original', 'message.original'), ('message_Revised', 'message.Revised'), + ('message_original', 'message.original'), + ('source_author', 'source.author'), ] } self.assertEqual(outline, expected) - From f6c0d26b0599bef92b44f1357762a35afbd53bb2 Mon Sep 17 00:00:00 2001 From: Brahmanand Singh Date: Sun, 27 Jun 2021 00:23:45 +0530 Subject: [PATCH 7/7] added output for issue#38 --- fixtures/nested_json_frt.csv | 4 ++++ 1 file changed, 4 insertions(+) create mode 100755 fixtures/nested_json_frt.csv diff --git a/fixtures/nested_json_frt.csv b/fixtures/nested_json_frt.csv new file mode 100755 index 0000000..4719cf4 --- /dev/null +++ b/fixtures/nested_json_frt.csv @@ -0,0 +1,4 @@ +"name","binomial name","major_producers_0","major_producers_1","major_producers_2","nutrition_carbohydrates","nutrition_fat","nutrition_protein" +"Apple","Malus domestica","China","United States","Turkey","13.81g","0.17g","0.26g" +"Orange","Citrus x sinensis","Brazil","United States","India","11.75g","0.12g","0.94g" +"Mango","Mangifera indica","India","China","Thailand","15g","0.38g","0.82g"