-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunicsv.py
149 lines (116 loc) · 4.64 KB
/
unicsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
This module contains unicode aware replacements for :func:`csv.reader`
and :func:`csv.writer`. It was stolen/extracted from the ``csvkit``
project to allow re-use when the whole ``csvkit`` package isn't
required.
The original implementations were largely copied from
`examples in the csv module documentation <http://docs.python.org/library/csv.html#examples>`_.
"""
import codecs
import csv
import fnmatch
try:
from StringIO import StringIO ## for Python 2
except ImportError:
from io import StringIO ## for Python 3
from exceptions import FieldSizeLimitError
# Codec names, lower-case and hyphen-separated, that UnicodeCSVWriter treats
# as "eight-bit": for these it encodes every field straight to the target
# encoding instead of routing the row through an intermediate UTF-8 queue.
EIGHT_BIT_ENCODINGS = [
    # UTF-8 spellings
    'utf-8',
    'u8',
    'utf',
    'utf8',
    # Latin-1 / ISO-8859-1 spellings
    'latin-1',
    'iso-8859-1',
    'iso8859-1',
    '8859',
    'cp819',
    'latin',
    'latin1',
    'l1',
]
class UTF8Recoder(object):
    """
    Iterator that reads an encoded stream and reencodes the input to UTF-8.

    :param f: Byte stream to read from. It is wrapped in the ``codecs``
        stream reader registered for ``encoding``.
    :param encoding: Name of the codec used to decode ``f``.

    Every iteration step returns the next item produced by the stream
    reader, encoded as UTF-8 bytes.
    """

    def __init__(self, f, encoding):
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def next(self):
        """Return the next decoded chunk of the stream as UTF-8 bytes."""
        # The builtin next() dispatches to whichever of ``next`` /
        # ``__next__`` the stream reader implements, so this does not depend
        # on the Python 2-only ``.next()`` method being present.
        return next(self.reader).encode('utf-8')

    # Python 3 spelling of the iterator protocol; ``next`` is kept for
    # existing Python 2 callers.
    __next__ = next
class UnicodeCSVReader(object):
    """
    A CSV reader which will read rows from a file in a given encoding.

    The input is transcoded to UTF-8 by :class:`UTF8Recoder`, parsed by
    :func:`csv.reader`, and every parsed field is decoded back to
    ``unicode`` before the row is returned.

    :param f: Stream to read, encoded in ``encoding``.
    :param encoding: Name of the codec ``f`` is encoded in.
    :param maxfieldsize: When truthy, handed to :func:`csv.field_size_limit`.
        That limit belongs to the ``csv`` module rather than to this
        instance, so it also applies to other readers.
    :param kwargs: Forwarded unchanged to :func:`csv.reader`.
    """

    def __init__(self, f, encoding='utf-8', maxfieldsize=None, **kwargs):
        f = UTF8Recoder(f, encoding)
        self.reader = csv.reader(f, **kwargs)

        if maxfieldsize:
            csv.field_size_limit(maxfieldsize)

    def next(self):
        """
        Return the next row as a list of ``unicode`` fields.

        :raises FieldSizeLimitError: when ``csv`` reports a field longer
            than the current field size limit.
        :raises csv.Error: any other parser error, re-raised untouched.
        """
        try:
            row = next(self.reader)
        except csv.Error as e:
            # Terrible way to test for this exception, but there is no subclass
            if fnmatch.fnmatch(str(e), 'field large[rt] than field limit *'):
                raise FieldSizeLimitError(csv.field_size_limit())
            # Bare ``raise`` keeps the traceback of the original failure;
            # ``raise e`` would restart it at this line on Python 2.
            raise

        return [unicode(s, 'utf-8') for s in row]

    def __iter__(self):
        return self

    @property
    def line_num(self):
        """Line counter of the wrapped :func:`csv.reader` object."""
        return self.reader.line_num
class UnicodeCSVWriter(object):
    """
    A CSV writer which will write rows to a file in the specified encoding.

    NB: Optimized so that eight-bit encodings skip re-encoding. See:
    https://github.com/onyxfish/csvkit/issues/175

    :param f: Output stream. For encodings listed in
        ``EIGHT_BIT_ENCODINGS`` the ``csv`` writer writes to it directly;
        otherwise it receives the re-encoded output of each row.
    :param encoding: Name of the target codec.
    :param kwargs: Forwarded unchanged to :func:`csv.writer`.
    """

    def __init__(self, f, encoding='utf-8', **kwargs):
        self.encoding = encoding
        # Normalise the spelling ("UTF_8" -> "utf-8") before the lookup.
        self._eight_bit = (self.encoding.lower().replace('_', '-') in
                           EIGHT_BIT_ENCODINGS)

        if self._eight_bit:
            self.writer = csv.writer(f, **kwargs)
        else:
            # Redirect output to a queue for reencoding
            self.queue = StringIO()
            self.writer = csv.writer(self.queue, **kwargs)
            self.stream = f
            self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """
        Write one row; ``None`` fields are written as empty strings.

        :param row: Iterable of field values. Each one is converted with
            ``unicode()`` before being encoded.
        """
        # Eight-bit targets are encoded directly; everything else goes
        # through UTF-8 first and is transcoded below.
        field_encoding = self.encoding if self._eight_bit else 'utf-8'
        self.writer.writerow([
            unicode('' if s is None else s).encode(field_encoding)
            for s in row
        ])

        if self._eight_bit:
            return

        # Fetch UTF-8 output from the queue...
        data = self.queue.getvalue().decode('utf-8')
        # ...reencode it into the target encoding and write it to the file
        self.stream.write(self.encoder.encode(data))
        # Empty the queue. Rewind first: io.StringIO.truncate() leaves the
        # stream position where it was, so without the seek the next row
        # would be written after a gap.
        self.queue.seek(0)
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row of ``rows`` via :meth:`writerow`."""
        for row in rows:
            self.writerow(row)
class UnicodeCSVDictReader(csv.DictReader):
    """
    Defer almost all implementation to :class:`csv.DictReader`, but wraps our unicode reader instead
    of :func:`csv.reader`.

    :param f: Stream to read; handed to :class:`UnicodeCSVReader`.
    :param fieldnames: As for :class:`csv.DictReader`.
    :param restkey: As for :class:`csv.DictReader`.
    :param restval: As for :class:`csv.DictReader`.
    :param args: Extra positional arguments as accepted by
        :class:`csv.DictReader`; the first one is the dialect.
    :param kwargs: ``encoding`` and ``maxfieldsize`` are consumed by
        :class:`UnicodeCSVReader`; everything else goes to both the base
        class and the unicode reader.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None, *args, **kwargs):
        # Options only UnicodeCSVReader understands. They must not reach
        # csv.DictReader, whose underlying csv.reader rejects unknown
        # keyword arguments.
        unicode_options = {}
        for option in ('encoding', 'maxfieldsize'):
            if option in kwargs:
                unicode_options[option] = kwargs.pop(option)

        csv.DictReader.__init__(self, f, fieldnames, restkey, restval, *args, **kwargs)

        reader_kwargs = dict(kwargs)
        if args:
            # For csv.DictReader the first extra positional is the dialect.
            # UnicodeCSVReader's first positional after ``f`` is the
            # encoding, so the dialect has to be passed by keyword.
            reader_kwargs['dialect'] = args[0]
        reader_kwargs.update(unicode_options)

        # Replace the plain reader created by the base class.
        self.reader = UnicodeCSVReader(f, **reader_kwargs)
class UnicodeCSVDictWriter(csv.DictWriter):
    """
    Defer almost all implementation to :class:`csv.DictWriter`, but wraps our unicode writer instead
    of :func:`csv.writer`.

    :param f: Output stream, handed to :class:`UnicodeCSVWriter`.
    :param fieldnames: Keys, in output order, of the dicts to be written.
    :param writeheader: When true, a header row made of ``fieldnames`` is
        written immediately.
    :param restval: Stored as ``self.restval`` for the base class.
    :param extrasaction: ``"raise"`` or ``"ignore"`` in any letter case.
    :param args: Extra positional arguments for :class:`UnicodeCSVWriter`.
    :param kwds: Extra keyword arguments for :class:`UnicodeCSVWriter`.
    :raises ValueError: if ``extrasaction`` is neither ``"raise"`` nor
        ``"ignore"``.
    """

    def __init__(self, f, fieldnames, writeheader=False, restval="", extrasaction="raise", *args, **kwds):
        self.fieldnames = fieldnames
        self.restval = restval

        normalized_action = extrasaction.lower()
        if normalized_action not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'" %
                             extrasaction)
        # Store the lower-cased value: validation is case-insensitive, but
        # the row conversion inherited from csv.DictWriter compares against
        # the literal "raise", so "RAISE" would otherwise act like "ignore".
        self.extrasaction = normalized_action

        self.writer = UnicodeCSVWriter(f, *args, **kwds)

        if writeheader:
            self.writerow(dict(zip(self.fieldnames, self.fieldnames)))