to_apify_request fails nicely if request cannot be serialized
vdusek committed Mar 6, 2024
1 parent d2393a9 commit c6c7f8a
Showing 3 changed files with 20 additions and 9 deletions.
17 changes: 10 additions & 7 deletions src/apify/scrapy/requests.py
@@ -24,21 +24,19 @@ def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:
     return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times'))
 
 
-def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
+def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:
     """Convert a Scrapy request to an Apify request.
 
     Args:
         scrapy_request: The Scrapy request to be converted.
         spider: The Scrapy spider that the request is associated with.
 
-    Raises:
-        TypeError: If the scrapy_request is not an instance of the scrapy.Request class.
-
     Returns:
-        The converted Apify request.
+        The converted Apify request if the conversion was successful, otherwise None.
     """
     if not isinstance(scrapy_request, Request):
-        raise TypeError('scrapy_request must be an instance of the scrapy.Request class')
+        Actor.log.warn('Failed to convert to Apify request: Scrapy request must be a Request instance.')
+        return None
 
     call_id = crypto_random_object_id(8)
     Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')
@@ -72,7 +70,12 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
     #   and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
     # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
     scrapy_request_dict = scrapy_request.to_dict(spider=spider)
-    scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()
+
+    try:
+        scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()
+    except TypeError as exc:
+        Actor.log.warn(f'Failed to convert to Apify request: serialization of the Scrapy request failed: {exc}')
+        return None
 
     apify_request['userData']['scrapy_request'] = scrapy_request_dict_encoded
 
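The encoding in requests.py is plain pickle plus base64. Below is a minimal sketch (not part of this diff) of that round trip, using an illustrative request_dict in place of scrapy_request.to_dict(spider=spider); the last block shows the kind of unpicklable value, here a thread lock, that makes pickle.dumps raise the TypeError the new try/except now turns into a warning and a None return:

import codecs
import pickle
import threading

# Illustrative stand-in for scrapy_request.to_dict(spider=spider): any picklable mapping.
request_dict = {'url': 'https://example.com', 'method': 'GET', 'meta': {'depth': 1}}

# Same technique as in requests.py: pickle the dict, then base64-encode it into a str.
encoded = codecs.encode(pickle.dumps(request_dict), 'base64').decode()

# The reverse step (what a counterpart conversion back to a Scrapy request would do).
decoded = pickle.loads(codecs.decode(encoded.encode(), 'base64'))
assert decoded == request_dict

# An unpicklable value (e.g. a lock smuggled in via request.meta) makes pickle.dumps
# raise TypeError, exactly the failure the new code reports instead of crashing on.
try:
    pickle.dumps({'meta': {'lock': threading.Lock()}})
except TypeError as exc:
    print(f'Serialization failed: {exc}')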
4 changes: 4 additions & 0 deletions src/apify/scrapy/scheduler.py
@@ -85,6 +85,10 @@ def enqueue_request(self: ApifyScheduler, request: Request) -> bool:
             raise TypeError('self.spider must be an instance of the Spider class')
 
         apify_request = to_apify_request(request, spider=self.spider)
+        if apify_request is None:
+            Actor.log.warn(f'Request {request} was not enqueued because it could not be converted to Apify request.')
+            return False
+
         Actor.log.debug(f'[{call_id}]: scrapy_request was transformed to apify_request (apify_request={apify_request})')
 
         if not isinstance(self._rq, RequestQueue):
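For callers of the scheduler, the effect of this change is that a request which cannot be converted is now reported as not enqueued (return False) rather than raising. A minimal sketch of that control flow (not the actual ApifyScheduler: the converter is a stand-in, the standard logging module replaces Actor.log, and the RequestQueue interaction is omitted):

import logging
from typing import Optional

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('enqueue_sketch')


def convert_request(request: object) -> Optional[dict]:
    # Stand-in for to_apify_request: None signals a failed conversion.
    if not isinstance(request, dict) or 'url' not in request:
        logger.warning('Failed to convert to Apify request: %r', request)
        return None
    return {'url': request['url'], 'userData': {}}


def enqueue_request(request: object) -> bool:
    # Mirrors the new guard in ApifyScheduler.enqueue_request.
    apify_request = convert_request(request)
    if apify_request is None:
        logger.warning('Request %r was not enqueued (conversion failed).', request)
        return False
    # ...the real scheduler would now add apify_request to its RequestQueue...
    return True


print(enqueue_request({'url': 'https://example.com'}))  # True
print(enqueue_request('invalid_request'))                # False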
8 changes: 6 additions & 2 deletions tests/unit/scrapy/requests/test_to_apify_request.py
@@ -21,6 +21,7 @@ def test__to_apify_request__simple(spider: Spider) -> None:
     scrapy_request = Request(url='https://example.com')
 
     apify_request = to_apify_request(scrapy_request, spider)
+    assert apify_request is not None
     assert apify_request.get('url') == 'https://example.com'
 
     user_data = apify_request.get('userData', {})
@@ -35,6 +36,7 @@ def test__to_apify_request__headers(spider: Spider) -> None:
 
     apify_request = to_apify_request(scrapy_request, spider)
 
+    assert apify_request is not None
     assert apify_request['headers'] == dict(scrapy_request_headers.to_unicode_dict())
 
 
@@ -47,6 +49,7 @@ def test__to_apify_request__without_id_and_unique_key(spider: Spider) -> None:
 
     apify_request = to_apify_request(scrapy_request, spider)
 
+    assert apify_request is not None
     assert apify_request.get('url') == 'https://example.com'
     assert apify_request.get('method') == 'GET'
 
@@ -70,6 +73,7 @@ def test__to_apify_request__with_id_and_unique_key(spider: Spider) -> None:
     )
 
     apify_request = to_apify_request(scrapy_request, spider)
+    assert apify_request is not None
 
     assert apify_request.get('url') == 'https://example.com'
     assert apify_request.get('method') == 'GET'
@@ -87,5 +91,5 @@ def test__to_apify_request__with_id_and_unique_key(spider: Spider) -> None:
 def test__to_apify_request__invalid_scrapy_request(spider: Spider) -> None:
     scrapy_request = 'invalid_request'
 
-    with pytest.raises(TypeError):
-        to_apify_request(scrapy_request, spider)
+    apify_request = to_apify_request(scrapy_request, spider)
+    assert apify_request is None

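Taken together, the commit changes the error contract of to_apify_request from raising TypeError to returning None. A sketch of what that looks like from caller code; it assumes apify and scrapy are installed, that the import path mirrors this repository layout (apify.scrapy.requests), and it has not been verified inside a running Actor:

from scrapy import Request, Spider

from apify.scrapy.requests import to_apify_request


class ExampleSpider(Spider):
    name = 'example'


spider = ExampleSpider()

# A non-Request object no longer raises TypeError; the call now yields None.
assert to_apify_request('invalid_request', spider) is None

# A regular request still converts; callers should guard for None before using the result.
converted = to_apify_request(Request(url='https://example.com'), spider)
if converted is not None:
    print(converted['url'], converted['method'])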