diff --git a/fixtures/data.outline.json b/fixtures/data.outline.json new file mode 100755 index 0000000..a93ad96 --- /dev/null +++ b/fixtures/data.outline.json @@ -0,0 +1,17 @@ +{ + "collection": "nodes", + "map": [ + [ + "message_Revised", + "message.Revised" + ], + [ + "message_original", + "message.original" + ], + [ + "source_author", + "source.author" + ] + ] +} \ No newline at end of file diff --git a/fixtures/different_keys_per_row.outline.json b/fixtures/different_keys_per_row.outline.json new file mode 100755 index 0000000..c502506 --- /dev/null +++ b/fixtures/different_keys_per_row.outline.json @@ -0,0 +1,29 @@ +{ + "map": [ + [ + "this", + "this" + ], + [ + "tags_0", + "tags.0" + ], + [ + "tags_1", + "tags.1" + ], + [ + "tags_2", + "tags.2" + ], + [ + "that", + "that" + ], + [ + "theother", + "theother" + ] + ], + "collection": "nodes" +} \ No newline at end of file diff --git a/fixtures/line_delimited.outline.json b/fixtures/line_delimited.outline.json new file mode 100755 index 0000000..447be50 --- /dev/null +++ b/fixtures/line_delimited.outline.json @@ -0,0 +1,16 @@ +{ + "map": [ + [ + "source_author", + "source.author" + ], + [ + "message_original", + "message.original" + ], + [ + "message_Revised", + "message.Revised" + ] + ] +} \ No newline at end of file diff --git a/fixtures/nested_json_frt.csv b/fixtures/nested_json_frt.csv new file mode 100755 index 0000000..4719cf4 --- /dev/null +++ b/fixtures/nested_json_frt.csv @@ -0,0 +1,4 @@ +"name","binomial name","major_producers_0","major_producers_1","major_producers_2","nutrition_carbohydrates","nutrition_fat","nutrition_protein" +"Apple","Malus domestica","China","United States","Turkey","13.81g","0.17g","0.26g" +"Orange","Citrus x sinensis","Brazil","United States","India","11.75g","0.12g","0.94g" +"Mango","Mangifera indica","India","China","Thailand","15g","0.38g","0.82g" diff --git a/fixtures/nested_json_frt.json b/fixtures/nested_json_frt.json new file mode 100755 index 0000000..d7001bd --- 
/dev/null +++ b/fixtures/nested_json_frt.json @@ -0,0 +1,46 @@ +{ + "fruit": [ + { + "name": "Apple", + "binomial name": "Malus domestica", + "major_producers": [ + "China", + "United States", + "Turkey" + ], + "nutrition": { + "carbohydrates": "13.81g", + "fat": "0.17g", + "protein": "0.26g" + } + }, + { + "name": "Orange", + "binomial name": "Citrus x sinensis", + "major_producers": [ + "Brazil", + "United States", + "India" + ], + "nutrition": { + "carbohydrates": "11.75g", + "fat": "0.12g", + "protein": "0.94g" + } + }, + { + "name": "Mango", + "binomial name": "Mangifera indica", + "major_producers": [ + "India", + "China", + "Thailand" + ], + "nutrition": { + "carbohydrates": "15g", + "fat": "0.38g", + "protein": "0.82g" + } + } + ] +} \ No newline at end of file diff --git a/fixtures/nested_json_frt.outline.json b/fixtures/nested_json_frt.outline.json new file mode 100755 index 0000000..0c2e5da --- /dev/null +++ b/fixtures/nested_json_frt.outline.json @@ -0,0 +1,37 @@ +{ + "map": [ + [ + "name", + "name" + ], + [ + "binomial name", + "binomial name" + ], + [ + "major_producers_0", + "major_producers.0" + ], + [ + "major_producers_1", + "major_producers.1" + ], + [ + "major_producers_2", + "major_producers.2" + ], + [ + "nutrition_carbohydrates", + "nutrition.carbohydrates" + ], + [ + "nutrition_fat", + "nutrition.fat" + ], + [ + "nutrition_protein", + "nutrition.protein" + ] + ], + "collection": "fruit" +} \ No newline at end of file diff --git a/fixtures/outline.json b/fixtures/outline.json old mode 100644 new mode 100755 index b8d09b8..a93ad96 --- a/fixtures/outline.json +++ b/fixtures/outline.json @@ -1,7 +1,17 @@ -{ - "map": [ - ["author", "source.author"], - ["message", "message.original"] - ], - "collection": "nodes" -} +{ + "collection": "nodes", + "map": [ + [ + "message_Revised", + "message.Revised" + ], + [ + "message_original", + "message.original" + ], + [ + "source_author", + "source.author" + ] + ] +} \ No newline at end of file diff --git 
a/gen_outline.py b/gen_outline.py index 8ab20aa..51dfb23 100755 --- a/gen_outline.py +++ b/gen_outline.py @@ -6,7 +6,7 @@ def key_paths(d): def helper(path, x): if isinstance(x, dict): - for k, v in x.iteritems(): + for k, v in x.items(): for ret in helper(path + [k], v): yield ret elif isinstance(x, list): @@ -40,6 +40,8 @@ def key_map_to_list(key_map): # We convert to strings *after* sorting so that array indices come out # in the correct order. return [(path_join(k, '_'), path_join(k)) for k in sorted(key_map.keys())] + # to keep keys in the order defined in the JSON, do not sort: + # return [(path_join(k, '_'), path_join(k)) for k in key_map.keys()] def make_outline(json_file, each_line, collection_key): if each_line: @@ -77,7 +79,7 @@ def main(): if outfile is None: fileName, fileExtension = os.path.splitext(args.json_file.name) outfile = fileName + '.outline.json' - +#sort the keys; this makes sure that the output file always has the same order of fields with open(outfile, 'w') as f: json.dump(outline, f, indent=2, sort_keys=True) diff --git a/json2csv.py b/json2csv.py index e70ef12..685a5f4 100755 --- a/json2csv.py +++ b/json2csv.py @@ -1,15 +1,15 @@ #!/usr/bin/env python -try: - import unicodecsv as csv -except ImportError: - import csv - +#python3's csv module handles unicode natively, so unicodecsv is not needed +import csv import json import operator import os from collections import OrderedDict import logging +import argparse +#reduce is part of functools for py3 +import functools as ft logging.basicConfig(level=logging.DEBUG) @@ -59,7 +59,7 @@ def process_each(self, data): data = data[self.collection] for d in data: - logging.info(d) + #logging.info(d) self.rows.append(self.process_row(d)) def process_row(self, item): @@ -69,7 +69,7 @@ def process_row(self, item): for header, keys in self.key_map.items(): try: - row[header] = reduce(operator.getitem, keys, item) + row[header] = ft.reduce(operator.getitem, keys, item) except (KeyError, IndexError, TypeError): row[header] = None @@ -88,7 
+88,7 @@ def make_string(self, item): elif isinstance(item, dict): return self.DICT_OPEN + self.DICT_SEP_CHAR.join([self.KEY_VAL_CHAR.join([k, self.make_string(val)]) for k, val in item.items()]) + self.DICT_CLOSE else: - return unicode(item) + return item def write_csv(self, filename='output.csv', make_strings=False): """Write the processed rows to the given filename @@ -99,8 +99,9 @@ def write_csv(self, filename='output.csv', make_strings=False): out = self.make_strings() else: out = self.rows - with open(filename, 'wb+') as f: - writer = csv.DictWriter(f, self.key_map.keys()) + #opening in write-only mode and specifying the unix dialect to quote all fields + with open(filename, 'w') as f: + writer = csv.DictWriter(f, self.key_map.keys(), dialect='unix') writer.writeheader() writer.writerows(out) @@ -119,7 +120,6 @@ def process_each(self, data, collection=None): def init_parser(): - import argparse parser = argparse.ArgumentParser(description="Converts JSON to CSV") parser.add_argument('json_file', type=argparse.FileType('r'), help="Path to JSON data file to load") @@ -153,3 +153,4 @@ def init_parser(): outfile = fileName + '.csv' loader.write_csv(filename=outfile, make_strings=args.strings) + diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 4e2f0a7..0000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -unicodecsv==0.9.0 diff --git a/tests.py b/tests.py index 821e4c3..580a121 100644 --- a/tests.py +++ b/tests.py @@ -26,8 +26,8 @@ def test_process_row(self): self.assertIn('id', row.keys()) self.assertIn('count', row.keys()) - self.assertEquals(row['id'], 'Someone') - self.assertEquals(row['count'], 1) + self.assertEqual(row['id'], 'Someone') + self.assertEqual(row['count'], 1) def test_process_row_nested_data(self): """Ensure that nested keys (with . 
notation) are processed""" @@ -42,8 +42,8 @@ def test_process_row_nested_data(self): self.assertIn('author', row.keys()) self.assertIn('message', row.keys()) - self.assertEquals(row['author'], 'Someone') - self.assertEquals(row['message'], 'Hey!') + self.assertEqual(row['author'], 'Someone') + self.assertEqual(row['message'], 'Hey!') def test_process_row_array_index(self): """Ensure that array indices are properly handled as part of the dot notation""" @@ -56,14 +56,14 @@ def test_process_each(self): test_data = json.loads('{"result":[{"_id" : "Someone","count" : 1}]}') loader.process_each(test_data) - self.assertEquals(len(loader.rows), 1) + self.assertEqual(len(loader.rows), 1) row = loader.rows[0] self.assertIs(type(row), dict) self.assertIn('id', row.keys()) self.assertIn('count', row.keys()) - self.assertEquals(row['id'], 'Someone') - self.assertEquals(row['count'], 1) + self.assertEqual(row['id'], 'Someone') + self.assertEqual(row['count'], 1) def test_process_each_optional_key(self): """Ensure a key that is not always present won't prevent data extraction @@ -75,12 +75,12 @@ def test_process_each_optional_key(self): test_data = json.loads('''[ {"_id": "Someone","count": 1, "tags": ["super"]}, {"_id": "Another", "tags": []}]''') - self.assertEquals(len(test_data), 2) + self.assertEqual(len(test_data), 2) loader.process_each(test_data) - self.assertEquals(len(loader.rows), 2) + self.assertEqual(len(loader.rows), 2) second_row = loader.rows[1] - self.assertEquals(second_row['id'], 'Another') + self.assertEqual(second_row['id'], 'Another') # works for missing dict keys self.assertIsNone(second_row['count']) # and missing list indices