
Commit 274d756

[Bug Fix - Added item validation support for dataclasses, fixed timeout]

1 parent 212c447

8 files changed, +65 −59 lines changed

scrapeops_scrapy/__init__.py (+1 −1)

@@ -1 +1 @@
-__version__ = "0.3"
+__version__ = "0.3.1"

scrapeops_scrapy/core/api.py (+1 −1)

@@ -7,7 +7,7 @@
 
 class SOPSRequest(object):
 
-    TIMEOUT = 5
+    TIMEOUT = 15
     RETRY_LIMIT = 3
     API_KEY = None
     JOB_GROUP_ID = None
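The raised TIMEOUT is the class-level default used when the SDK calls the ScrapeOps API; the actual request method is not part of this commit, so the post() below is only an illustrative sketch of how such a default is typically applied:

import requests

class SOPSRequest(object):

    TIMEOUT = 15     # seconds; raised from 5 so slow API responses no longer cut off stats delivery
    RETRY_LIMIT = 3

    def post(self, url, body=None):
        # Hypothetical call site (not shown in this diff): the class-level
        # TIMEOUT caps how long each HTTP call to the ScrapeOps API may block.
        return requests.post(url, json=body, timeout=self.TIMEOUT)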

scrapeops_scrapy/core/core.py (+2 −3)

@@ -36,8 +36,6 @@ def close_sdk(self, spider=None, reason=None):
         self.spider_close_stats(reason=reason, crawler=self.crawler)
         self.send_stats(periodic_stats=self._periodic_stats, overall_stats=self._overall_stats, stats_type='finished', reason=reason)
         self.close_periodic_monitor()
-        if self._scrapeops_debug_mode:
-            self.display_stats()
 
 
     def request_stats(self, request=None):
@@ -66,7 +64,8 @@ def exception_stats(self, request=None, exception_class=None):
     def item_stats(self, signal_type=None, item=None, response=None, spider=None):
         if self.sdk_enabled():
             request_response_object = RequestResponse(response=response)
-            self.request_response_middleware.normalise_domain_proxy_data(request_response_object)
+            if response is not None:
+                self.request_response_middleware.normalise_domain_proxy_data(request_response_object)
             if signal_type == 'item_scraped':
                 self.item_validation_middleware.validate(request_response_object, item)
             self.generate_item_stats(request_response_object, signal=signal_type, response=response)
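item_stats is driven by Scrapy's item signals, and the new guard covers the case where a signal arrives without a usable response. A minimal sketch of how such handlers are typically wired up in a Scrapy extension; the class name and connect calls below are illustrative, not taken from this commit:

from scrapy import signals

class ScrapeOpsMonitor:

    @classmethod
    def from_crawler(cls, crawler):
        # Hypothetical wiring: shows where item_stats would be invoked from.
        ext = cls()
        crawler.signals.connect(ext.on_item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(ext.on_item_dropped, signal=signals.item_dropped)
        crawler.signals.connect(ext.on_item_error, signal=signals.item_error)
        return ext

    def on_item_scraped(self, item, response, spider):
        # response can evidently be missing in practice, which the new
        # 'if response is not None' check in item_stats now tolerates.
        self.item_stats(signal_type='item_scraped', item=item, response=response, spider=spider)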

scrapeops_scrapy/core/model.py (+1 −1)

@@ -214,7 +214,7 @@ def get_settings(self, spider):
                 self.spider_settings[key] = value
 
     def include_setting(self, key):
-        exclusion_terms = ['API_KEY', 'APIKEY', 'SECRET_KEY', 'SECRETKEY']
+        exclusion_terms = ['API_KEY', 'APIKEY', 'SECRET_KEY', 'SECRETKEY', 'PASSWORD', 'CONNECTION_STRING']
         if key in self._scrapeops_settings_exclusion_list:
             return False
         for term in exclusion_terms:
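The extra terms widen the set of spider settings treated as secrets and kept out of the data reported to ScrapeOps. The tail of include_setting is outside this hunk; a minimal sketch of the likely matching logic, for illustration only (the substring check and return values are assumptions):

def include_setting(self, key):
    # Settings whose names contain any of these terms are assumed to hold
    # credentials and are excluded from the reported spider settings.
    exclusion_terms = ['API_KEY', 'APIKEY', 'SECRET_KEY', 'SECRETKEY', 'PASSWORD', 'CONNECTION_STRING']
    if key in self._scrapeops_settings_exclusion_list:
        return False
    for term in exclusion_terms:
        if term in key.upper():   # assumed: case-insensitive substring match on the setting name
            return False
    return True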

scrapeops_scrapy/normalizer/request_response.py (+12 −11)

@@ -63,16 +63,16 @@ def __init__(self):
     """
 
     def get_proxy_name(self):
-        return self._proxy_name
+        return self._proxy_name or 'unknown'
 
     def get_proxy_setup(self):
-        return self._proxy_setup
+        return self._proxy_setup or 'unknown'
 
     def get_domain(self):
-        return self._domain
+        return self._domain or 'unknown'
 
     def get_page_type(self):
-        return self._page_type
+        return self._page_type or 'unknown'
 
     def get_proxy_api_name(self):
         return self._proxy_api_name
@@ -84,7 +84,7 @@ def get_raw_proxy(self):
         return self.raw_proxy_port
 
     def get_real_url(self):
-        return self._real_url
+        return self._real_url or 'unknown'
 
     def get_validation_test(self):
         return self._validation_test or 'pass'
@@ -193,12 +193,13 @@ class RequestResponse(BaseRequestResponse):
     def __init__(self, signal_type=None, request=None, response=None):
         BaseRequestResponse.__init__(self)
         self.signal_type = signal_type
-        self.request = response.request if request is None else request
-        self.raw_url = request.url if response is None else response.url
-        self.raw_proxy_port = self.request.meta.get('proxy')
-        self.raw_domain = DomainNormalizer.get_domain(self.raw_url)
-        self._active_proxy = self._active_porxy_port = False if self.raw_proxy_port is None else True
-        self.raw_headers = self.request.headers
+        if request is not None or response is not None:
+            self.request = response.request if request is None else request
+            self.raw_url = request.url if response is None else response.url
+            self.raw_proxy_port = self.request.meta.get('proxy')
+            self.raw_domain = DomainNormalizer.get_domain(self.raw_url)
+            self._active_proxy = self._active_porxy_port = False if self.raw_proxy_port is None else True
+            self.raw_headers = self.request.headers
 
     """
     Domain Normalization
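Together with the 'unknown' fallbacks on the getters, the guarded constructor lets item_stats build a RequestResponse even when no response object is available. A small illustration of the intended behaviour, assuming the private fields default to None in BaseRequestResponse.__init__ (that initializer is not shown in this diff):

rr = RequestResponse(response=None)   # previously raised AttributeError on response.request
rr.get_domain()        # -> 'unknown' instead of None
rr.get_proxy_name()    # -> 'unknown'
rr.get_real_url()      # -> 'unknown'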

scrapeops_scrapy/stats/logger.py (+12 −7)

@@ -98,7 +98,12 @@ def generate_response_stats(self, request_response_object, request=None, response=None):
 
 
     def generate_item_stats(self, request_response_object, signal=None, response=None):
-        request = response.request
+        if response is not None:
+            request = response.request
+            request_method = request.method
+            status = response.status
+        else:
+            request_method = status = 'unknown'
         proxy_name = request_response_object.get_proxy_name()
         proxy_setup = request_response_object.get_proxy_setup()
         domain_name = request_response_object.get_domain()
@@ -110,16 +115,16 @@ def generate_item_stats(self, request_response_object, signal=None, response=None):
         self.check_periodic_stats()
 
         if signal == 'item_scraped':
-            self.inc_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|items')
-            self.inc_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|items')
+            self.inc_value(self._periodic_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|items')
+            self.inc_value(self._overall_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|items')
 
         elif signal == 'item_dropped':
-            self.inc_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|items_dropped')
-            self.inc_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|items_dropped')
+            self.inc_value(self._periodic_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|items_dropped')
+            self.inc_value(self._overall_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|items_dropped')
 
         elif signal == 'item_error':
-            self.inc_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|item_errors')
-            self.inc_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|item_errors')
+            self.inc_value(self._periodic_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|item_errors')
+            self.inc_value(self._overall_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|item_errors')
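Each counter key is a pipe-delimited string built from the request, response and normaliser fields, so when no response is available the method and status slots now degrade to the literal string 'unknown' instead of raising on response.request. Purely illustrative keys (the concrete field values below are invented placeholders, not taken from the SDK):

responses|GET|public_proxy|none|example.com|none|200|pass|none|none|none|items       # item_scraped with a response
responses|unknown|unknown|unknown|unknown|none|unknown|pass|none|none|none|items     # item_scraped without a response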

scrapeops_scrapy/validators/item_validator.py (+29 −30)

@@ -1,5 +1,4 @@
-from typing import Dict
-from scrapy.item import Item
+from itemadapter import ItemAdapter, is_item
 
 class ItemValidator(object):
 
@@ -25,35 +24,35 @@ def extract_name_fields_item(item):
         return
 
     def validate(self, request_response_object, item):
-        if ItemValidator.ITEM_COVERAGE_ENABLED:
-            self.increment_items()
-            if isinstance(item, Item):
+        if ItemValidator.ITEM_COVERAGE_ENABLED and is_item(item):
+            try:
+                self.increment_items()
+                adapter = ItemAdapter(item)
                 item_name = ItemValidator.get_item_name(item)
-                fields = ItemValidator.get_item_fields(item)
-                field_keys = fields.keys()
-            if isinstance(item, Dict):
-                item_name = 'Unknown'
-                field_keys = item.keys()
-            if item_name is not None and field_keys is not None:
-                domain = request_response_object.get_domain()
-                invalid_fields = []
-                valid_item = True
-                self.check_item_exists(domain, item_name, field_keys)
-                self.item_coverage[domain][item_name]['num_items'] += 1
-                self.increment_total_fields(field_keys)
-                for k in field_keys:
-                    if(item.get(k) is not None and item.get(k) != ''):
-                        self.item_coverage[domain][item_name]['coverage'][k] += 1
-                    else:
-                        valid_item = False
-                        self.increment_invalid_fields()
-                        invalid_fields.append(k)
-
-                if valid_item is False:
-                    self.item_coverage[domain][item_name]['num_invalid_items'] += 1
-                    self.increment_invalid_items()
-                    if ItemValidator.INVALID_ITEM_URLS_LOGGING_ENABLED and len(invalid_fields) > 0:
-                        self.log_invalid_item_url(request_response_object.get_real_url(), item_name, invalid_fields)
+                dict_item = adapter.asdict()
+                field_keys = dict_item.keys()
+                if item_name is not None and field_keys is not None:
+                    domain = request_response_object.get_domain()
+                    invalid_fields = []
+                    valid_item = True
+                    self.check_item_exists(domain, item_name, field_keys)
+                    self.item_coverage[domain][item_name]['num_items'] += 1
+                    self.increment_total_fields(field_keys)
+                    for k in field_keys:
+                        if(dict_item.get(k) is not None and dict_item.get(k) != ''):
+                            self.item_coverage[domain][item_name]['coverage'][k] += 1
+                        else:
+                            valid_item = False
+                            self.increment_invalid_fields()
+                            invalid_fields.append(k)
+
+                    if valid_item is False:
+                        self.item_coverage[domain][item_name]['num_invalid_items'] += 1
+                        self.increment_invalid_items()
+                        if ItemValidator.INVALID_ITEM_URLS_LOGGING_ENABLED and len(invalid_fields) > 0:
+                            self.log_invalid_item_url(request_response_object.get_real_url(), item_name, invalid_fields)
+            except Exception:
+                pass
 
 
     def check_item_exists(self, domain, item_name, field_keys):
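Replacing the isinstance checks on scrapy.Item and dict with itemadapter is what adds the dataclass support named in the commit message: is_item() accepts every item type itemadapter recognises (dict, scrapy.Item, dataclass and attrs instances), and ItemAdapter(...).asdict() gives the validator a uniform dict view for its field-coverage checks. A small standalone illustration:

from dataclasses import dataclass
from itemadapter import ItemAdapter, is_item

@dataclass
class Product:
    # Example dataclass item, as a Scrapy spider might yield it.
    name: str = None
    price: float = None

item = Product(name='ACME Widget', price=None)

print(is_item(item))               # True - dataclass instances now pass the guard
print(ItemAdapter(item).asdict())  # {'name': 'ACME Widget', 'price': None}
# 'price' is None, so the validator would count it as an invalid field for this item.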

setup.py (+7 −5)

@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 
 
-VERSION = '0.3'
+VERSION = '0.3.1'
 DESCRIPTION = 'Scrapeops Scrapy SDK, is a monitoring tool for your Scrapy spiders.'
 
 setup(name='scrapeops_scrapy',
@@ -14,10 +14,11 @@
       url="https://github.com/ScrapeOps/scrapeops-scrapy-sdk",
       packages=find_packages(),
       install_requires=[
-          "tld==0.12.4",
-          "requests==2.24.0",
-          "json5==0.9.5",
-          "urllib3==1.25.10",
+          "tld>=0.12.4",
+          "requests>=2.24.0",
+          "json5>=0.9.5",
+          "urllib3>=1.25.10",
+          "itemadapter>=0.4.0",
       ],
       classifiers=[
           "Programming Language :: Python",
@@ -26,6 +27,7 @@
           "Programming Language :: Python :: 3.7",
           "Programming Language :: Python :: 3.8",
           "Programming Language :: Python :: 3.9",
+          "Programming Language :: Python :: 3.10",
           "License :: OSI Approved :: BSD License",
           "Operating System :: OS Independent",
           "Intended Audience :: Developers",
