Skip to content

Commit 72c2a25

Browse files
add gzip input parameter, add unittest, add test file
1 parent d4f62b8 commit 72c2a25

File tree

4 files changed

+34
-11
lines changed

4 files changed

+34
-11
lines changed

pysenal/io/file.py

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""
55
import json
66
import os
7+
import gzip
78
from collections import Iterable
89
import configparser
910
from ..utils.logger import get_logger
@@ -79,7 +80,7 @@ def read_lines_lazy(filename, encoding=_ENCODING_UTF8, keep_end=False,
7980
file.close()
8081

8182

82-
def read_file(filename, encoding=_ENCODING_UTF8, default=None):
83+
def read_file(filename, encoding=_ENCODING_UTF8, default=None, is_gzip=False):
8384
"""
8485
wrap open function to read text in file
8586
:param filename: file path
@@ -90,8 +91,14 @@ def read_file(filename, encoding=_ENCODING_UTF8, default=None):
9091
"""
9192
if not os.path.exists(filename) and default is not None:
9293
return default
93-
with open(filename, encoding=encoding) as f:
94-
return f.read()
94+
if not is_gzip:
95+
f = open(filename, encoding=encoding)
96+
else:
97+
f = gzip.open(filename, 'rt', encoding=encoding)
98+
99+
text = f.read()
100+
f.close()
101+
return text
95102

96103

97104
def write_file(filename, data, encoding=_ENCODING_UTF8):
@@ -163,66 +170,79 @@ def write_json(filename, data, serialize_method=None):
163170
json.dump(data, f, ensure_ascii=False, default=serialize_method)
164171

165172

166-
def read_jsonline(filename, encoding=_ENCODING_UTF8, default=None):
173+
def read_jsonline(filename, encoding=_ENCODING_UTF8, default=None, is_gzip=False):
167174
"""
168175
read jsonl file
169176
:param filename: source file path
170177
:param encoding: file encoding
171178
:param default: value returned when the file does not exist.
172179
If it is None, the exception is raised as usual.
180+
:param is_gzip: whether input file is gzip format
173181
:return: list of objects, one object per line
174182
"""
175183
if not os.path.exists(filename) and default is not None:
176184
return default
177-
file = open(filename, encoding=encoding)
185+
if not is_gzip:
186+
file = open(filename, encoding=encoding)
187+
else:
188+
file = gzip.open(filename, 'rt', encoding=encoding)
178189
items = []
179190
for line in file:
180191
items.append(json.loads(line))
181192
file.close()
182193
return items
183194

184195

185-
def read_jsonline_lazy(filename, encoding=_ENCODING_UTF8, default=None):
196+
def read_jsonline_lazy(filename, encoding=_ENCODING_UTF8, default=None, is_gzip=False):
186197
"""
187198
use generator to load jsonl one line every time
188199
:param filename: source file path
189200
:param encoding: file encoding
190201
:param default: value returned when the file does not exist.
191202
If it is None, the exception is raised as usual.
203+
:param is_gzip: whether input file is gzip file
192204
:return: json object
193205
"""
194206
if not os.path.exists(filename) and default is not None:
195207
return default
196-
file = open(filename, encoding=encoding)
208+
if not is_gzip:
209+
file = open(filename, encoding=encoding)
210+
else:
211+
file = gzip.open(filename, 'rt', encoding=encoding)
197212
for line in file:
198213
yield json.loads(line)
199214
file.close()
200215

201216

202-
def get_jsonline_chunk_lazy(filename, chunk_size, encoding=_ENCODING_UTF8, default=None):
217+
def get_jsonline_chunk_lazy(filename, chunk_size, encoding=_ENCODING_UTF8,
218+
default=None, is_gzip=False):
203219
"""
204220
use generator to read jsonline items chunk by chunk
205221
:param filename: source jsonline file
206222
:param chunk_size: chunk size
207223
:param encoding: file encoding
208224
:param default: default value to return when the file does not exist
225+
:param is_gzip: whether input file is gzip file
209226
:return: chunk of some items
210227
"""
211-
file_generator = read_jsonline_lazy(filename, encoding, default)
228+
file_generator = read_jsonline_lazy(filename, encoding, default, is_gzip)
212229
for chunk in get_chunk(file_generator, chunk_size):
213230
yield chunk
214231

215232

216-
def get_jsonline_chunk(filename, chunk_size, encoding=_ENCODING_UTF8, default=None):
233+
def get_jsonline_chunk(filename, chunk_size, encoding=_ENCODING_UTF8,
234+
default=None, is_gzip=False):
217235
"""
218236
read jsonline items chunk by chunk
219237
:param filename: source jsonline file
220238
:param chunk_size: chunk size
221239
:param encoding: file encoding
222240
:param default: default value to return when the file does not exist
241+
:param is_gzip: whether input file is gzip format
223242
:return: chunk of some items
224243
"""
225-
chunk_generator = get_chunk(read_jsonline_lazy(filename, encoding, default), chunk_size)
244+
f = read_jsonline_lazy(filename, encoding, default, is_gzip)
245+
chunk_generator = get_chunk(f, chunk_size)
226246
return list(chunk_generator)
227247

228248

tests/io/test_file.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def test_read(example_lines, fake_filename):
5959
true_text = '\n'.join(example_lines)
6060
assert text == true_text
6161

62+
assert read_file(filename + '.gz', is_gzip=True) == true_text
6263
assert read_file(fake_filename, default='') == ''
6364
with pytest.raises(FileNotFoundError):
6465
read_file(fake_filename)
@@ -70,10 +71,12 @@ def test_read_json():
7071

7172
def test_read_jsonline(example_json, fake_filename):
7273
assert read_jsonline(TEST_DATA_DIR + 'a.jsonl') == example_json
74+
assert read_jsonline(TEST_DATA_DIR + 'a.jsonl.gz', is_gzip=True) == example_json
7375

7476

7577
def test_read_jsonline_chunk(example_json):
7678
assert get_jsonline_chunk(TEST_DATA_DIR + 'a.jsonl', 2) == [example_json]
79+
assert get_jsonline_chunk(TEST_DATA_DIR + 'a.jsonl.gz', 2, is_gzip=True) == [example_json]
7780
generator = get_jsonline_chunk_lazy(TEST_DATA_DIR + 'a.jsonl', 2)
7881
assert isinstance(generator, types.GeneratorType)
7982
assert list(generator) == [example_json]

tests/test_data/a.jsonl.gz

90 Bytes
Binary file not shown.

tests/test_data/a.txt.gz

72 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)