4
4
"""
5
5
import json
6
6
import os
7
+ import gzip
7
8
from collections import Iterable
8
9
import configparser
9
10
from ..utils .logger import get_logger
@@ -79,7 +80,7 @@ def read_lines_lazy(filename, encoding=_ENCODING_UTF8, keep_end=False,
79
80
file .close ()
80
81
81
82
82
def read_file(filename, encoding=_ENCODING_UTF8, default=None, is_gzip=False):
    """
    wrap open function to read text in file
    :param filename: file path
    :param encoding: file encoding
    :param default: returned value when filename is not existed.
        If it's None, exception will be raised as usual.
    :param is_gzip: whether input file is gzip format
    :return: whole decoded text of the file
    """
    if not os.path.exists(filename) and default is not None:
        return default
    if is_gzip:
        # 'rt' makes gzip decompress and decode to str, like builtin open
        f = gzip.open(filename, 'rt', encoding=encoding)
    else:
        f = open(filename, encoding=encoding)
    # context manager guarantees the handle is closed even if read() fails
    with f:
        return f.read()
95
102
96
103
97
104
def write_file (filename , data , encoding = _ENCODING_UTF8 ):
@@ -163,66 +170,79 @@ def write_json(filename, data, serialize_method=None):
163
170
json .dump (data , f , ensure_ascii = False , default = serialize_method )
164
171
165
172
166
def read_jsonline(filename, encoding=_ENCODING_UTF8, default=None, is_gzip=False):
    """
    read jsonl file
    :param filename: source file path
    :param encoding: file encoding
    :param default: returned value when filename is not existed.
        If it's None, exception will be raised as usual.
    :param is_gzip: whether input file is gzip format
    :return: object list, an object corresponding a line
    """
    if not os.path.exists(filename) and default is not None:
        return default
    if is_gzip:
        # 'rt' makes gzip decompress and decode to str, like builtin open
        file = gzip.open(filename, 'rt', encoding=encoding)
    else:
        file = open(filename, encoding=encoding)
    # context manager closes the file even when a line fails to parse
    with file:
        return [json.loads(line) for line in file]
183
194
184
195
185
def read_jsonline_lazy(filename, encoding=_ENCODING_UTF8, default=None, is_gzip=False):
    """
    use generator to load jsonl one line every time
    :param filename: source file path
    :param encoding: file encoding
    :param default: when filename is not existed and default is not None,
        the generator finishes without yielding anything (this is a
        generator function, so the value itself is not yielded).
        If it's None, exception will be raised as usual.
    :param is_gzip: whether input file is gzip file
    :return: json object
    """
    if not os.path.exists(filename) and default is not None:
        # inside a generator this only ends the iteration
        return default
    if is_gzip:
        # 'rt' makes gzip decompress and decode to str, like builtin open
        file = gzip.open(filename, 'rt', encoding=encoding)
    else:
        file = open(filename, encoding=encoding)
    # context manager closes the file when the generator is exhausted,
    # closed early by the consumer, or interrupted by a parse error
    with file:
        for line in file:
            yield json.loads(line)
200
215
201
216
202
def get_jsonline_chunk_lazy(filename, chunk_size, encoding=_ENCODING_UTF8,
                            default=None, is_gzip=False):
    """
    lazily yield the items of a jsonline file grouped into chunks
    :param filename: source jsonline file
    :param chunk_size: chunk size
    :param encoding: file encoding
    :param default: forwarded to read_jsonline_lazy for the missing-file case
    :param is_gzip: whether input file is gzip file
    :return: generator of chunks as produced by get_chunk
    """
    items = read_jsonline_lazy(filename, encoding, default, is_gzip)
    # hand every chunk produced by get_chunk straight to the caller
    yield from get_chunk(items, chunk_size)
214
231
215
232
216
def get_jsonline_chunk(filename, chunk_size, encoding=_ENCODING_UTF8,
                       default=None, is_gzip=False):
    """
    load a jsonline file and return all of its chunks at once
    :param filename: source jsonline file
    :param chunk_size: chunk size
    :param encoding: file encoding
    :param default: forwarded to read_jsonline_lazy for the missing-file case
    :param is_gzip: whether input file is gzip format
    :return: list holding every chunk produced by get_chunk
    """
    items = read_jsonline_lazy(filename, encoding, default, is_gzip)
    # materialize the lazy chunk stream into a list
    return list(get_chunk(items, chunk_size))
227
247
228
248
0 commit comments