1
- from typing import Dict
2
- from scrapy .item import Item
1
+ from itemadapter import ItemAdapter , is_item
3
2
4
3
class ItemValidator (object ):
5
4
@@ -25,35 +24,35 @@ def extract_name_fields_item(item):
25
24
return
26
25
27
26
def validate(self, request_response_object, item):
    """Update per-domain item coverage statistics for one scraped item.

    Does nothing unless ``ItemValidator.ITEM_COVERAGE_ENABLED`` is truthy
    and ``itemadapter.is_item(item)`` accepts ``item``. Otherwise the item
    is converted to a plain dict through ``ItemAdapter(item).asdict()`` and
    every field is classified:

    * a value that is neither ``None`` nor ``''`` bumps that field's
      ``coverage`` counter for the (domain, item name) pair;
    * anything else counts as an invalid field.

    An item with at least one invalid field is counted as an invalid item
    and, when ``ItemValidator.INVALID_ITEM_URLS_LOGGING_ENABLED`` is set,
    its URL is passed to ``self.log_invalid_item_url``.

    Args:
        request_response_object: Object exposing ``get_domain()`` and
            ``get_real_url()`` for the response that produced ``item``.
        item: The scraped object to inspect.

    Returns:
        None. The results are recorded on ``self`` (``self.item_coverage``
        and the ``increment_*`` counters).

    Coverage accounting is best-effort: any exception raised while
    processing the item is logged with its traceback and then suppressed,
    so a statistics failure never propagates to the caller.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import section.
    import logging

    if not (ItemValidator.ITEM_COVERAGE_ENABLED and is_item(item)):
        return

    try:
        self.increment_items()
        adapter = ItemAdapter(item)
        item_name = ItemValidator.get_item_name(item)
        dict_item = adapter.asdict()
        # NOTE(review): get_item_name is defined outside this view; it is
        # presumably able to return None for unnamed items -- confirm.
        if item_name is None:
            return

        # dict.keys() never returns None, so no separate None check is
        # needed on the key view.
        field_keys = dict_item.keys()
        domain = request_response_object.get_domain()

        # check_item_exists is called before the counters are indexed;
        # presumably it creates the missing entries -- confirm.
        self.check_item_exists(domain, item_name, field_keys)
        item_stats = self.item_coverage[domain][item_name]
        item_stats['num_items'] += 1
        self.increment_total_fields(field_keys)

        invalid_fields = []
        for key, value in dict_item.items():
            if value is not None and value != '':
                item_stats['coverage'][key] += 1
            else:
                self.increment_invalid_fields()
                invalid_fields.append(key)

        # A non-empty invalid_fields list is exactly the "item is invalid"
        # condition, so no separate boolean flag is tracked.
        if invalid_fields:
            item_stats['num_invalid_items'] += 1
            self.increment_invalid_items()
            if ItemValidator.INVALID_ITEM_URLS_LOGGING_ENABLED:
                self.log_invalid_item_url(
                    request_response_object.get_real_url(),
                    item_name,
                    invalid_fields,
                )
    except Exception:
        # Keep the crawl running, but leave a trace instead of silently
        # discarding the failure.
        logging.getLogger(__name__).exception(
            'Item coverage validation failed; item skipped'
        )
57
56
58
57
59
58
def check_item_exists (self , domain , item_name , field_keys ):
0 commit comments