Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve error handling for to_apify_request serialization failures #191

Merged
merged 2 commits into from Mar 8, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGELOG.md
Expand Up @@ -2,7 +2,9 @@

## [1.6.1](../../releases/tag/v1.6.1) - Unreleased

...
### Fixed

- Improve error handling for `to_apify_request` serialization failures

## [1.6.0](../../releases/tag/v1.6.0) - 2024-02-23

Expand Down
72 changes: 36 additions & 36 deletions src/apify/scrapy/requests.py
Expand Up @@ -24,57 +24,57 @@ def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:
return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times'))


def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:
    """Convert a Scrapy request to an Apify request.

    Args:
        scrapy_request: The Scrapy request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Returns:
        The converted Apify request if the conversion was successful, otherwise None.
    """
    # Invalid input is reported and swallowed rather than raised, so that one bad
    # request cannot crash the whole scheduling pipeline.
    if not isinstance(scrapy_request, Request):
        # NOTE: Logger.warn is deprecated in favour of Logger.warning.
        Actor.log.warning('Failed to convert to Apify request: Scrapy request must be a Request instance.')
        return None

    # Short random id used only to correlate the two debug log lines of one call.
    call_id = crypto_random_object_id(8)
    Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')

    try:
        apify_request = {
            'url': scrapy_request.url,
            'method': scrapy_request.method,
            'userData': scrapy_request.meta.get('userData', {}),
        }

        if isinstance(scrapy_request.headers, Headers):
            apify_request['headers'] = dict(scrapy_request.headers.to_unicode_dict())
        else:
            Actor.log.warning(f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}')

        if _is_request_produced_by_middleware(scrapy_request):
            # Requests produced by redirect/retry middleware keep the URL as their unique key.
            apify_request['uniqueKey'] = scrapy_request.url
        else:
            # Add 'id' to the apify_request
            if scrapy_request.meta.get('apify_request_id'):
                apify_request['id'] = scrapy_request.meta['apify_request_id']

            # Add 'uniqueKey' to the apify_request
            if scrapy_request.meta.get('apify_request_unique_key'):
                apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']

        # Serialize the Scrapy Request and store it in the apify_request.
        # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
        #   and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
        # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
        scrapy_request_dict = scrapy_request.to_dict(spider=spider)
        scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()
        apify_request['userData']['scrapy_request'] = scrapy_request_dict_encoded

    except Exception as exc:
        # Serialization can fail for exotic requests (e.g. unpicklable callbacks);
        # report and return None so the caller can skip the request.
        Actor.log.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}')
        return None

    Actor.log.debug(f'[{call_id}]: scrapy_request was converted to the apify_request={apify_request}')
    return apify_request
Expand Down
4 changes: 4 additions & 0 deletions src/apify/scrapy/scheduler.py
Expand Up @@ -85,6 +85,10 @@ def enqueue_request(self: ApifyScheduler, request: Request) -> bool:
raise TypeError('self.spider must be an instance of the Spider class')

apify_request = to_apify_request(request, spider=self.spider)
if apify_request is None:
Actor.log.warn(f'Request {request} was not enqueued because it could not be converted to Apify request.')
return False

Actor.log.debug(f'[{call_id}]: scrapy_request was transformed to apify_request (apify_request={apify_request})')

if not isinstance(self._rq, RequestQueue):
Expand Down
8 changes: 6 additions & 2 deletions tests/unit/scrapy/requests/test_to_apify_request.py
Expand Up @@ -21,6 +21,7 @@ def test__to_apify_request__simple(spider: Spider) -> None:
scrapy_request = Request(url='https://example.com')

apify_request = to_apify_request(scrapy_request, spider)
assert apify_request is not None
assert apify_request.get('url') == 'https://example.com'

user_data = apify_request.get('userData', {})
Expand All @@ -35,6 +36,7 @@ def test__to_apify_request__headers(spider: Spider) -> None:

apify_request = to_apify_request(scrapy_request, spider)

assert apify_request is not None
assert apify_request['headers'] == dict(scrapy_request_headers.to_unicode_dict())


Expand All @@ -47,6 +49,7 @@ def test__to_apify_request__without_id_and_unique_key(spider: Spider) -> None:

apify_request = to_apify_request(scrapy_request, spider)

assert apify_request is not None
assert apify_request.get('url') == 'https://example.com'
assert apify_request.get('method') == 'GET'

Expand All @@ -70,6 +73,7 @@ def test__to_apify_request__with_id_and_unique_key(spider: Spider) -> None:
)

apify_request = to_apify_request(scrapy_request, spider)
assert apify_request is not None

assert apify_request.get('url') == 'https://example.com'
assert apify_request.get('method') == 'GET'
Expand All @@ -87,5 +91,5 @@ def test__to_apify_request__with_id_and_unique_key(spider: Spider) -> None:
def test__to_apify_request__invalid_scrapy_request(spider: Spider) -> None:
    """Passing a non-Request object must yield None instead of raising."""
    scrapy_request = 'invalid_request'

    apify_request = to_apify_request(scrapy_request, spider)
    assert apify_request is None