1
1
from pathlib import Path , PurePosixPath , PureWindowsPath
2
2
from collections .abc import Mapping
3
3
from tqdm import tqdm
4
+ import logging
4
5
from .settings import config
5
6
from .errors import DataJointError , MissingExternalFile
6
7
from .hash import uuid_from_buffer , uuid_from_file
10
11
from . import s3
11
12
from .utils import safe_write , safe_copy
12
13
14
# Package-wide logger, named after the root package (text before the first dot).
logger = logging.getLogger(__name__.partition(".")[0])
15
+
13
16
CACHE_SUBFOLDING = (
14
17
2 ,
15
18
2 ,
@@ -72,9 +75,7 @@ def definition(self):
72
75
73
76
@property
74
77
def table_name (self ):
75
- return "{external_table_root}_{store}" .format (
76
- external_table_root = EXTERNAL_TABLE_ROOT , store = self .store
77
- )
78
+ return f"{ EXTERNAL_TABLE_ROOT } _{ self .store } "
78
79
79
80
@property
80
81
def s3 (self ):
@@ -276,9 +277,7 @@ def upload_filepath(self, local_filepath):
276
277
# the tracking entry exists, check that it's the same file as before
277
278
if contents_hash != check_hash [0 ]:
278
279
raise DataJointError (
279
- "A different version of '{file}' has already been placed." .format (
280
- file = relative_filepath
281
- )
280
+ f"A different version of '{ relative_filepath } ' has already been placed."
282
281
)
283
282
else :
284
283
# upload the file and create its tracking entry
@@ -304,27 +303,43 @@ def download_filepath(self, filepath_hash):
304
303
:param filepath_hash: The hash (UUID) of the relative_path
305
304
:return: hash (UUID) of the contents of the downloaded file or Nones
306
305
"""
306
+
307
+ def _need_checksum (local_filepath , expected_size ):
308
+ limit = config .get ("filepath_checksum_size_limit" )
309
+ actual_size = Path (local_filepath ).stat ().st_size
310
+ if expected_size != actual_size :
311
+ # this should never happen without outside interference
312
+ raise DataJointError (
313
+ f"'{ local_filepath } ' downloaded but size did not match."
314
+ )
315
+ return limit is None or actual_size < limit
316
+
307
317
if filepath_hash is not None :
308
- relative_filepath , contents_hash = ( self & { "hash" : filepath_hash }). fetch1 (
309
- "filepath" , "contents_hash"
310
- )
318
+ relative_filepath , contents_hash , size = (
319
+ self & { "hash" : filepath_hash }
320
+ ). fetch1 ( "filepath" , "contents_hash" , "size" )
311
321
external_path = self ._make_external_filepath (relative_filepath )
312
322
local_filepath = Path (self .spec ["stage" ]).absolute () / relative_filepath
313
- file_exists = (
314
- Path (local_filepath ).is_file ()
315
- and uuid_from_file (local_filepath ) == contents_hash
323
+
324
+ file_exists = Path (local_filepath ).is_file () and (
325
+ not _need_checksum (local_filepath , size )
326
+ or uuid_from_file (local_filepath ) == contents_hash
316
327
)
328
+
317
329
if not file_exists :
318
330
self ._download_file (external_path , local_filepath )
319
- checksum = uuid_from_file (local_filepath )
320
331
if (
321
- checksum != contents_hash
322
- ): # this should never happen without outside interference
332
+ _need_checksum (local_filepath , size )
333
+ and uuid_from_file (local_filepath ) != contents_hash
334
+ ):
335
+ # this should never happen without outside interference
323
336
raise DataJointError (
324
- "'{file}' downloaded but did not pass checksum'" .format (
325
- file = local_filepath
326
- )
337
+ f"'{ local_filepath } ' downloaded but did not pass checksum."
327
338
)
339
+ if not _need_checksum (local_filepath , size ):
340
+ logger .warning (
341
+ f"Skipped checksum for file with hash: { contents_hash } , and path: { local_filepath } "
342
+ )
328
343
return str (local_filepath ), contents_hash
329
344
330
345
# --- UTILITIES ---
@@ -402,7 +417,7 @@ def delete(
402
417
delete_external_files = None ,
403
418
limit = None ,
404
419
display_progress = True ,
405
- errors_as_string = True
420
+ errors_as_string = True ,
406
421
):
407
422
"""
408
423
0 commit comments