Skip to content

Commit 07c5553

Browse files
Merge pull request #1025 from jverswijver/remove_checksum
Add option to disable checksums for external files and blobs
2 parents 8f9e81c + cc803b4 commit 07c5553

File tree

5 files changed

+71
-25
lines changed

5 files changed

+71
-25
lines changed

CHANGELOG.md

+5-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
## Release notes
22

33
### 0.13.6 -- Jun 13, 2022
4-
* Add - unified package level logger for package (#667) PR #1031
5-
* Update - swap various datajoint messages, warnings, etc. to use the new logger. (#667) PR #1031
4+
* Add - Config option to set threshold for when to stop using checksums for filepath stores. PR #1025
5+
* Add - Unified package level logger for package (#667) PR #1031
6+
* Update - Swap various datajoint messages, warnings, etc. to use the new logger. (#667) PR #1031
7+
* Bugfix - Fix query caching deleting non-datajoint files PR #1027
8+
* Update - Minimum Python version for Datajoint-Python is now 3.7 PR #1027
69

710
### 0.13.5 -- May 19, 2022
811
* Update - Import ABC from collections.abc for Python 3.10 compatibility

datajoint/external.py

+34-19
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from pathlib import Path, PurePosixPath, PureWindowsPath
22
from collections.abc import Mapping
33
from tqdm import tqdm
4+
import logging
45
from .settings import config
56
from .errors import DataJointError, MissingExternalFile
67
from .hash import uuid_from_buffer, uuid_from_file
@@ -10,6 +11,8 @@
1011
from . import s3
1112
from .utils import safe_write, safe_copy
1213

14+
logger = logging.getLogger(__name__.split(".")[0])
15+
1316
CACHE_SUBFOLDING = (
1417
2,
1518
2,
@@ -72,9 +75,7 @@ def definition(self):
7275

7376
@property
7477
def table_name(self):
75-
return "{external_table_root}_{store}".format(
76-
external_table_root=EXTERNAL_TABLE_ROOT, store=self.store
77-
)
78+
return f"{EXTERNAL_TABLE_ROOT}_{self.store}"
7879

7980
@property
8081
def s3(self):
@@ -276,9 +277,7 @@ def upload_filepath(self, local_filepath):
276277
# the tracking entry exists, check that it's the same file as before
277278
if contents_hash != check_hash[0]:
278279
raise DataJointError(
279-
"A different version of '{file}' has already been placed.".format(
280-
file=relative_filepath
281-
)
280+
f"A different version of '{relative_filepath}' has already been placed."
282281
)
283282
else:
284283
# upload the file and create its tracking entry
@@ -304,27 +303,43 @@ def download_filepath(self, filepath_hash):
304303
:param filepath_hash: The hash (UUID) of the relative_path
305304
:return: hash (UUID) of the contents of the downloaded file or Nones
306305
"""
306+
307+
def _need_checksum(local_filepath, expected_size):
308+
limit = config.get("filepath_checksum_size_limit")
309+
actual_size = Path(local_filepath).stat().st_size
310+
if expected_size != actual_size:
311+
# this should never happen without outside interference
312+
raise DataJointError(
313+
f"'{local_filepath}' downloaded but size did not match."
314+
)
315+
return limit is None or actual_size < limit
316+
307317
if filepath_hash is not None:
308-
relative_filepath, contents_hash = (self & {"hash": filepath_hash}).fetch1(
309-
"filepath", "contents_hash"
310-
)
318+
relative_filepath, contents_hash, size = (
319+
self & {"hash": filepath_hash}
320+
).fetch1("filepath", "contents_hash", "size")
311321
external_path = self._make_external_filepath(relative_filepath)
312322
local_filepath = Path(self.spec["stage"]).absolute() / relative_filepath
313-
file_exists = (
314-
Path(local_filepath).is_file()
315-
and uuid_from_file(local_filepath) == contents_hash
323+
324+
file_exists = Path(local_filepath).is_file() and (
325+
not _need_checksum(local_filepath, size)
326+
or uuid_from_file(local_filepath) == contents_hash
316327
)
328+
317329
if not file_exists:
318330
self._download_file(external_path, local_filepath)
319-
checksum = uuid_from_file(local_filepath)
320331
if (
321-
checksum != contents_hash
322-
): # this should never happen without outside interference
332+
_need_checksum(local_filepath, size)
333+
and uuid_from_file(local_filepath) != contents_hash
334+
):
335+
# this should never happen without outside interference
323336
raise DataJointError(
324-
"'{file}' downloaded but did not pass checksum'".format(
325-
file=local_filepath
326-
)
337+
f"'{local_filepath}' downloaded but did not pass checksum."
327338
)
339+
if not _need_checksum(local_filepath, size):
340+
logger.warning(
341+
f"Skipped checksum for file with hash: {contents_hash}, and path: {local_filepath}"
342+
)
328343
return str(local_filepath), contents_hash
329344

330345
# --- UTILITIES ---
@@ -402,7 +417,7 @@ def delete(
402417
delete_external_files=None,
403418
limit=None,
404419
display_progress=True,
405-
errors_as_string=True
420+
errors_as_string=True,
406421
):
407422
"""
408423

datajoint/settings.py

+1
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
"display.show_tuple_count": True,
4747
"database.use_tls": None,
4848
"enable_python_native_blobs": True, # python-native/dj0 encoding support
49+
"filepath_checksum_size_limit": None, # file size limit for when to disable checksums
4950
}
5051
)
5152

docs-parts/intro/Releases_lang1.rst

+5-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
0.13.6 -- Jun 13, 2022
22
----------------------
3-
* Add - unified package level logger for package (#667) PR #1031
4-
* Update - swap various datajoint messages, warnings, etc. to use the new logger. (#667) PR #1031
3+
* Add - Config option to set threshold for when to stop using checksums for filepath stores. PR #1025
4+
* Add - Unified package level logger for package (#667) PR #1031
5+
* Update - Swap various datajoint messages, warnings, etc. to use the new logger. (#667) PR #1031
6+
* Bugfix - Fix query caching deleting non-datajoint files PR #1027
7+
* Update - Minimum Python version for Datajoint-Python is now 3.7 PR #1027
58

69
0.13.5 -- May 19, 2022
710
----------------------

tests/test_filepath.py

+26-2
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,11 @@
33
import os
44
from pathlib import Path
55
import random
6-
76
from .schema_external import schema, Filepath, FilepathS3, stores_config
7+
import logging
8+
import io
9+
10+
logger = logging.getLogger("datajoint")
811

912

1013
def setUp(self):
@@ -132,7 +135,9 @@ def test_duplicate_error_s3():
132135
test_duplicate_error(store="repo-s3")
133136

134137

135-
def test_filepath_class(table=Filepath(), store="repo"):
138+
def test_filepath_class(table=Filepath(), store="repo", verify_checksum=True):
139+
if not verify_checksum:
140+
dj.config["filepath_checksum_size_limit"] = 0
136141
dj.errors._switch_filepath_types(True)
137142
stage_path = dj.config["stores"][store]["stage"]
138143
# create a mock file
@@ -169,6 +174,7 @@ def test_filepath_class(table=Filepath(), store="repo"):
169174
# delete from external table
170175
table.external[store].delete(delete_external_files=True)
171176
dj.errors._switch_filepath_types(False)
177+
dj.config["filepath_checksum_size_limit"] = None
172178

173179

174180
def test_filepath_class_again():
@@ -185,6 +191,24 @@ def test_filepath_class_s3_again():
185191
test_filepath_class(FilepathS3(), "repo-s3")
186192

187193

194+
def test_filepath_class_no_checksum():
195+
log_capture = io.StringIO()
196+
stream_handler = logging.StreamHandler(log_capture)
197+
log_format = logging.Formatter(
198+
"[%(asctime)s][%(funcName)s][%(levelname)s]: %(message)s"
199+
)
200+
stream_handler.setFormatter(log_format)
201+
stream_handler.set_name("test_limit_warning")
202+
logger.addHandler(stream_handler)
203+
test_filepath_class(verify_checksum=False)
204+
log_contents = log_capture.getvalue()
205+
log_capture.close()
206+
for handler in logger.handlers: # Clean up handler
207+
if handler.name == "test_limit_warning":
208+
logger.removeHandler(handler)
209+
assert "Skipped checksum for file with hash:" in log_contents
210+
211+
188212
def test_filepath_cleanup(table=Filepath(), store="repo"):
189213
"""test deletion of filepath entries from external table"""
190214

0 commit comments

Comments
 (0)