This repository was archived by the owner on Nov 27, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 63
/
Copy pathfile.py
235 lines (197 loc) · 8.07 KB
/
file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import os
from collections import OrderedDict
class WrongFormatError(Exception):
    """Exception type signalling that a file is not in the expected format.

    NOTE(review): not raised anywhere in this module; presumably raised by
    format-specific readers -- confirm against callers.
    """
class EmptyFileError(Exception):
    """Exception type signalling that a file has no content (zero size)."""
class BrokenFormatError(Exception):
    """Exception type signalling a file whose format is broken.

    NOTE(review): not raised anywhere in this module; presumably raised by
    format-specific readers -- confirm against callers.
    """
class BrokenReaderError(Exception):
    """Exception type signalling a failure of a reader implementation.

    NOTE(review): not raised anywhere in this module; the exact conditions
    are defined by the code that raises it -- confirm against callers.
    """
class OptionalImportError(Exception):
    """Exception type signalling a problem with an optional import.

    NOTE(review): not raised anywhere in this module; presumably used when an
    optional dependency is missing -- confirm against callers.
    """
# Compatibility alias: make `FileNotFoundError` a module-level name on both
# Python 3 (where the builtin exists) and Python 2 (where IOError is used).
try:
    _not_found_error = FileNotFoundError  # Python 3: the builtin is available
except NameError:
    _not_found_error = IOError  # Python 2: no such builtin
FileNotFoundError = _not_found_error
class File(OrderedDict):
    """Base class for file readers/writers; behaves as an ordered dictionary.

    Subclasses implement the `_read`, `_write` and `_toDataFrame` hooks. The
    public `read`, `write` and `toDataFrame` methods perform the common checks
    and then delegate to those hooks.

    Attributes:
        filename: path of the file last read or written, or None.
    """

    def __init__(self, filename=None, **kwargs):
        """Create the object and, if `filename` is given, read it right away.

        Args:
            filename: optional path of the file to read.
            **kwargs: forwarded to `read` (and from there to `_read`).
        """
        # Initialise the underlying OrderedDict before `_read` stores any key.
        super(File, self).__init__()
        if filename:
            self.read(filename, **kwargs)
        else:
            self.filename = None

    def read(self, filename=None, **kwargs):
        """Validate the target file and delegate the parsing to `_read`.

        Args:
            filename: path to read; when omitted, `self.filename` is reused.
            **kwargs: forwarded to `_read`.

        Raises:
            Exception: if no filename is available.
            OSError: (errno 2) if the path is not an existing file.
            EmptyFileError: if the file has zero size.
        """
        if filename:
            self.filename = filename
        if not self.filename:
            raise Exception('No filename provided')
        if not os.path.isfile(self.filename):
            raise OSError(2,'File not found:',self.filename)
        if os.stat(self.filename).st_size == 0:
            raise EmptyFileError('File is empty:',self.filename)
        # Calling children function
        self._read(**kwargs)

    def write(self, filename=None, **kwargs):
        """Delegate the writing to `_write`.

        Args:
            filename: path to write; when omitted, `self.filename` is reused.
                When given, it replaces `self.filename`.
            **kwargs: forwarded to `_write`.

        Raises:
            Exception: if no filename is available.
        """
        if filename:
            self.filename = filename
        if not self.filename:
            raise Exception('No filename provided')
        # Calling children function
        self._write(**kwargs)

    def toDataFrame(self):
        """Return whatever the subclass hook `_toDataFrame` returns."""
        return self._toDataFrame()

    # --------------------------------------------------------------------------------
    # --- Properties
    # --------------------------------------------------------------------------------
    @property
    def size(self):
        """Size of `self.filename` as reported by `os.path.getsize`."""
        return os.path.getsize(self.filename)

    @property
    def encoding(self):
        """Detect the encoding from the first bytes (at most 32) of the file.

        Returns:
            'utf-8-sig' if the file starts with the UTF-8 byte-order mark,
            otherwise the 'encoding' entry of the `chardet.detect` result.
            None if computing the size raised a TypeError (e.g. when
            `self.filename` is None).
        """
        import codecs
        try:
            nbytes = min(32, self.size)
        except TypeError:
            return None
        with open(self.filename, 'rb') as f:
            raw = f.read(nbytes)
        if raw.startswith(codecs.BOM_UTF8):
            return 'utf-8-sig'
        # chardet is a third-party package: import it only when really needed,
        # so the branches above work without it being installed.
        import chardet
        return chardet.detect(raw)['encoding']

    # --------------------------------------------------------------------------------}
    # --- Conversions
    # --------------------------------------------------------------------------------{
    def toCSV(self, filename=None, extension='.csv', **kwargs):
        """ By default, writes dataframes to CSV
        Keywords arguments are the same as pandas DataFrame to_csv:
        - sep: separator
        - index: write index or not
        - etc.

        The work is delegated to `writeFileDataFrames` with `dataFrameToCSV`
        (both from the sibling `converters` module).
        """
        from .converters import writeFileDataFrames
        from .converters import dataFrameToCSV
        writeFileDataFrames(self, dataFrameToCSV, filename=filename, extension=extension, **kwargs)

    def toOUTB(self, filename=None, extension='.outb', **kwargs):
        """ write to OUTB (delegates to `writeFileDataFrames` with `dataFrameToOUTB`)"""
        from .converters import writeFileDataFrames
        from .converters import dataFrameToOUTB
        writeFileDataFrames(self, dataFrameToOUTB, filename=filename, extension=extension, **kwargs)

    def toParquet(self, filename=None, extension='.parquet', **kwargs):
        """ write to Parquet (delegates to `writeFileDataFrames` with `dataFrameToParquet`)"""
        from .converters import writeFileDataFrames
        from .converters import dataFrameToParquet
        writeFileDataFrames(self, dataFrameToParquet, filename=filename, extension=extension, **kwargs)

    # --------------------------------------------------------------------------------
    # --- Sub class methods
    # --------------------------------------------------------------------------------
    def _read(self,**kwargs):
        """Hook called by `read`; must be implemented in the subclass."""
        raise NotImplementedError("Method must be implemented in the subclass")

    def _write(self, **kwargs):
        """Hook called by `write`; must be implemented in the subclass.

        Accepts `**kwargs` because `write` forwards its keyword arguments.
        """
        raise NotImplementedError("Method must be implemented in the subclass")

    def _toDataFrame(self):
        """Hook called by `toDataFrame`; must be implemented in the subclass."""
        raise NotImplementedError("Method must be implemented in the subclass")

    def _fromDataFrame(self):
        """Hook to be implemented in the subclass (not called in this class)."""
        raise NotImplementedError("Method must be implemented in the subclass")

    def _fromDictionary(self):
        """Hook to be implemented in the subclass (not called in this class)."""
        raise NotImplementedError("Method must be implemented in the subclass")

    def _fromFile(self):
        """Hook to be implemented in the subclass (not called in this class)."""
        raise NotImplementedError("Method must be implemented in the subclass")

    # --------------------------------------------------------------------------------
    # --- Static methods
    # --------------------------------------------------------------------------------
    @staticmethod
    def defaultExtension():
        """Must be implemented in the subclass."""
        raise NotImplementedError("Method must be implemented in the subclass")

    @staticmethod
    def formatName():
        """Must be implemented in the subclass."""
        raise NotImplementedError("Method must be implemented in the subclass")

    def test_write_read(self,bDelete=False):
        """ Test that we can write and then read what we wrote
        NOTE: this does not check that what we read is the same..

        The data is written to `<name>_TMP<ext>` next to `self.filename` and
        read back from there; as a side effect `self.filename` then points to
        that temporary file.

        Args:
            bDelete: if True, remove the temporary file after the re-read.

        Returns:
            Path of the temporary file (already removed if `bDelete`).

        Raises:
            Exception: if writing or re-reading fails; the message embeds the
                text of the underlying error.
        """
        # --- First, test write function (assuming read)
        try:
            base, ext = os.path.splitext(self.filename)
            filename_out = base + '_TMP' + ext
            self.write(filename_out)
        except Exception as e:
            # str(e) instead of e.args[0]: `args` may be empty or start with a
            # non-string (e.g. the errno of an OSError), which would turn the
            # concatenation itself into a TypeError/IndexError.
            raise Exception('Error writing what we read\n' + str(e))
        # --- Second, re-read what we wrote
        try:
            self.read(filename_out)
        except Exception as e:
            raise Exception('Error reading what we wrote\n' + str(e))
        if bDelete:
            os.remove(filename_out)
        return filename_out

    def test_ascii(self,bCompareWritesOnly=False,bDelete=True):
        """Write/re-read the data and compare files with `ascii_comp`.

        Args:
            bCompareWritesOnly: if True, compare two successive writes
                (`<name>_TMP<ext>` and `<name>_TMP2<ext>`); otherwise compare
                the file originally referenced by `self.filename` with the
                first write.
            bDelete: forwarded to `ascii_comp`; when the comparison succeeds
                and `bCompareWritesOnly` is set, the first write is removed
                here as well.

        Raises:
            Exception: if `ascii_comp` reports the two files as different.
        """
        # compare ourselves (assuming read has occured) with what we write
        # NOTE: the base name is taken before test_write_read changes self.filename
        f,ext=os.path.splitext(self.filename)
        # --- Perform a simple write/read test
        filename_out=self.test_write_read()
        # --- Perform ascii comparison (and delete if success)
        if bCompareWritesOnly:
            f1 = filename_out
            f2 = f+'_TMP2'+ext
            self.write(f2)
        else:
            f1 = f+ext
            f2 = filename_out
        bStat=ascii_comp(f1,f2,bDelete=bDelete)
        if bStat:
            if bCompareWritesOnly and bDelete:
                os.remove(f1)
        else:
            raise Exception('The ascii content of {} and {} are different'.format(f1,f2))
# --------------------------------------------------------------------------------}
# --- Helper functions
# --------------------------------------------------------------------------------{
def isBinary(filename):
    """Guess whether a file is binary by inspecting its first line.

    The file is opened in text mode with the default encoding. It is reported
    as binary when the first line cannot be decoded, or when it contains a
    character whose code is below 10 or strictly between 14 and 31.

    NOTE(review): code 9 (tab) falls in the "below 10" range, so a first line
    containing a tab is reported as binary -- confirm this is intended.

    Args:
        filename: path of the file to inspect.

    Returns:
        bool: True if the file looks binary, False otherwise.
    """
    with open(filename, 'r') as stream:
        try:
            first_line = stream.readline()
        except UnicodeDecodeError:
            # undecodable content: treat as binary
            return True
    return any(ord(ch) < 10 or 14 < ord(ch) < 31 for ch in first_line)
def numberOfLines(filename, method=1):
    """Return the number of lines of a file.

    Args:
        filename: path of the file to inspect.
        method: 1 iterates over the file opened in binary mode and counts the
            lines; 2 reads the file as UTF-8 text (decoding errors ignored) in
            blocks of 65536 characters and counts the newline characters.

    Returns:
        int: number of lines (method 1) or of newline characters (method 2);
        the two differ by one when the last line has no trailing newline.

    Raises:
        NotImplementedError: if `method` is neither 1 nor 2.
    """
    if method==1:
        # Use a context manager so the handle is closed deterministically
        # instead of whenever the garbage collector gets to it.
        with open(filename, 'rb') as f:
            return sum(1 for _ in f)
    elif method==2:
        def blocks(stream, size=65536):
            # Yield successive chunks until the stream is exhausted
            while True:
                b = stream.read(size)
                if not b:
                    break
                yield b
        with open(filename, "r",encoding="utf-8",errors='ignore') as f:
            return sum(bl.count("\n") for bl in blocks(f))
    else:
        raise NotImplementedError()
def ascii_comp(file1,file2,bDelete=False):
    """ Compares the text content of two ascii files.

    Each file is read in text mode; tabs are replaced by spaces, every line is
    stripped, the lines are joined with '|' and any run of whitespace is
    collapsed to a single space before the comparison.

    Args:
        file1: path of the reference file.
        file2: path of the file compared against `file1`.
        bDelete: if True, `file2` is removed when the contents match.

    Returns:
        bool: True if the normalised contents are equal, False otherwise.
    """
    def _condensed(path):
        # Whitespace-insensitive, single-string view of one file
        with open(path, 'r') as handle:
            rows = handle.read().splitlines()
        joined = '|'.join(row.replace('\t',' ').strip() for row in rows)
        return ' '.join(joined.split())

    same = _condensed(file1) == _condensed(file2)
    if same and bDelete:
        os.remove(file2)
    return same