Skip to content

Commit

Permalink
Improve error handling for to_apify_request serialization failures
Browse files Browse the repository at this point in the history
  • Loading branch information
vdusek committed Mar 7, 2024
1 parent d2393a9 commit 78b4332
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 39 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Expand Up @@ -2,7 +2,9 @@

## [1.6.1](../../releases/tag/v1.6.1) - Unreleased

...
### Fixed

- Improve error handling for `to_apify_request` serialization failures

## [1.6.0](../../releases/tag/v1.6.0) - 2024-02-23

Expand Down
72 changes: 36 additions & 36 deletions src/apify/scrapy/requests.py
Expand Up @@ -24,57 +24,57 @@ def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:
return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times'))


def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:
    """Convert a Scrapy request to an Apify request.

    Args:
        scrapy_request: The Scrapy request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Returns:
        The converted Apify request if the conversion was successful, otherwise None.
    """
    if not isinstance(scrapy_request, Request):
        # Use `warning` (not the deprecated `warn` alias) — consistent with the other log calls below.
        Actor.log.warning('Failed to convert to Apify request: Scrapy request must be a Request instance.')
        return None

    call_id = crypto_random_object_id(8)
    Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')

    try:
        apify_request = {
            'url': scrapy_request.url,
            'method': scrapy_request.method,
            'userData': scrapy_request.meta.get('userData', {}),
        }

        if isinstance(scrapy_request.headers, Headers):
            apify_request['headers'] = dict(scrapy_request.headers.to_unicode_dict())
        else:
            Actor.log.warning(f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}')

        if _is_request_produced_by_middleware(scrapy_request):
            # Requests replayed by redirect/retry middleware keep the URL itself as the unique key.
            apify_request['uniqueKey'] = scrapy_request.url
        else:
            # Add 'id' to the apify_request
            if scrapy_request.meta.get('apify_request_id'):
                apify_request['id'] = scrapy_request.meta['apify_request_id']

            # Add 'uniqueKey' to the apify_request
            if scrapy_request.meta.get('apify_request_unique_key'):
                apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']

            # Serialize the Scrapy Request and store it in the apify_request.
            # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
            #   and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
            # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
            scrapy_request_dict = scrapy_request.to_dict(spider=spider)
            scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()
            apify_request['userData']['scrapy_request'] = scrapy_request_dict_encoded

    except Exception as exc:
        # Any failure (e.g. an unserializable callback during to_dict/pickle) is reported, not raised,
        # so a single bad request cannot crash the scheduler; callers must handle the None.
        Actor.log.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}')
        return None

    Actor.log.debug(f'[{call_id}]: scrapy_request was converted to the apify_request={apify_request}')
    return apify_request
Expand Down
4 changes: 4 additions & 0 deletions src/apify/scrapy/scheduler.py
Expand Up @@ -85,6 +85,10 @@ def enqueue_request(self: ApifyScheduler, request: Request) -> bool:
raise TypeError('self.spider must be an instance of the Spider class')

apify_request = to_apify_request(request, spider=self.spider)
if apify_request is None:
Actor.log.warn(f'Request {request} was not enqueued because it could not be converted to Apify request.')
return False

Actor.log.debug(f'[{call_id}]: scrapy_request was transformed to apify_request (apify_request={apify_request})')

if not isinstance(self._rq, RequestQueue):
Expand Down
8 changes: 6 additions & 2 deletions tests/unit/scrapy/requests/test_to_apify_request.py
Expand Up @@ -21,6 +21,7 @@ def test__to_apify_request__simple(spider: Spider) -> None:
scrapy_request = Request(url='https://example.com')

apify_request = to_apify_request(scrapy_request, spider)
assert apify_request is not None
assert apify_request.get('url') == 'https://example.com'

user_data = apify_request.get('userData', {})
Expand All @@ -35,6 +36,7 @@ def test__to_apify_request__headers(spider: Spider) -> None:

apify_request = to_apify_request(scrapy_request, spider)

assert apify_request is not None
assert apify_request['headers'] == dict(scrapy_request_headers.to_unicode_dict())


Expand All @@ -47,6 +49,7 @@ def test__to_apify_request__without_id_and_unique_key(spider: Spider) -> None:

apify_request = to_apify_request(scrapy_request, spider)

assert apify_request is not None
assert apify_request.get('url') == 'https://example.com'
assert apify_request.get('method') == 'GET'

Expand All @@ -70,6 +73,7 @@ def test__to_apify_request__with_id_and_unique_key(spider: Spider) -> None:
)

apify_request = to_apify_request(scrapy_request, spider)
assert apify_request is not None

assert apify_request.get('url') == 'https://example.com'
assert apify_request.get('method') == 'GET'
Expand All @@ -87,5 +91,5 @@ def test__to_apify_request__with_id_and_unique_key(spider: Spider) -> None:
def test__to_apify_request__invalid_scrapy_request(spider: Spider) -> None:
    """A non-Request input must yield None (the function no longer raises TypeError)."""
    scrapy_request = 'invalid_request'

    apify_request = to_apify_request(scrapy_request, spider)
    assert apify_request is None

0 comments on commit 78b4332

Please sign in to comment.