
Commit

Fixes conversion of userData and headers fields in Apify-Scrapy request translation (#179)
vdusek committed Jan 23, 2024
1 parent 6708f6f commit 1c68f62
Showing 12 changed files with 387 additions and 302 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
@@ -2,7 +2,13 @@

## [1.5.4](../../releases/tag/v1.5.4) - Unreleased

-...
+### Added
+
+- Add support for `headers` field in Apify <-> Scrapy request translation
+
+### Fixed
+
+- Fix conversion of `userData` field in Apify <-> Scrapy request translation

## [1.5.3](../../releases/tag/v1.5.3) - 2024-01-23

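The two entries above are easiest to verify end to end. Below is a minimal sketch of the translation round trip (the spider class, header value, and ID values are invented for illustration; in a real run the `apify_request_id` and `apify_request_unique_key` meta fields are filled in when requests come back from the request queue):

```python
from scrapy import Request, Spider

from apify.scrapy.requests import to_apify_request, to_scrapy_request


class DemoSpider(Spider):
    """Hypothetical spider, used only to satisfy the `spider` argument."""
    name = 'demo'


spider = DemoSpider()
request = Request(
    url='https://example.com',
    headers={'Authorization': 'Bearer <token>'},
    meta={
        'userData': {'label': 'DETAIL'},
        # Invented IDs so that the reverse translation below has its required keys.
        'apify_request_id': 'aBcDeFgHiJkLmNo',
        'apify_request_unique_key': 'https://example.com',
    },
)

apify_request = to_apify_request(request, spider=spider)
assert apify_request['headers'] == request.headers     # newly carried over
assert apify_request['userData']['label'] == 'DETAIL'  # no longer overwritten
assert 'scrapy_request' in apify_request['userData']   # serialized request blob

restored = to_scrapy_request(apify_request, spider=spider)
assert restored.meta['userData']['label'] == 'DETAIL'
```

Before this fix, `to_apify_request` replaced the whole `userData` dictionary with `{'scrapy_request': ...}`, so the `label` value above would have been lost on the way to the request queue.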
1 change: 1 addition & 0 deletions pyproject.toml
@@ -135,6 +135,7 @@ indent-style = "space"
"**/{tests}/*" = [
"D", # Everything from the pydocstyle
"INP001", # File {filename} is part of an implicit namespace package, add an __init__.py
"PT011", # `pytest.raises({ExceptionType})` is too broad, set the `match` parameter or use a more specific exception
"PLR2004", # Magic value used in comparison, consider replacing {value} with a constant variable
"S101", # Use of assert detected
"T20", # flake8-print
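For context, Ruff's PT011 fires on `pytest.raises` with a broad exception type and no `match` parameter; the new ignore permits the bare form under `tests/`. A hypothetical test in that style (the test name is illustrative; `spider` is unused before the validation error, so `None` suffices for the sketch):

```python
import pytest

from apify.scrapy.requests import to_scrapy_request


def test_apify_request_with_missing_keys() -> None:
    # With the ignore in place, the bare form is allowed in tests:
    with pytest.raises(ValueError):
        to_scrapy_request({'url': 'https://example.com', 'method': 'GET'}, spider=None)

    # The stricter form PT011 would otherwise require:
    with pytest.raises(ValueError, match='apify_request must contain'):
        to_scrapy_request({'url': 'https://example.com', 'method': 'GET'}, spider=None)
```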
3 changes: 2 additions & 1 deletion src/apify/scrapy/__init__.py
@@ -1,2 +1,3 @@
+from .requests import to_apify_request, to_scrapy_request
from .scheduler import ApifyScheduler
-from .utils import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request
+from .utils import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client
3 changes: 2 additions & 1 deletion src/apify/scrapy/middlewares/apify_retry.py
@@ -14,7 +14,8 @@
) from exc

from apify.actor import Actor
-from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
+from apify.scrapy.requests import to_apify_request
+from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client

if TYPE_CHECKING:
    from apify.storages import RequestQueue
139 changes: 139 additions & 0 deletions src/apify/scrapy/requests.py
@@ -0,0 +1,139 @@
from __future__ import annotations

import codecs
import pickle

try:
    from scrapy import Request, Spider
    from scrapy.utils.request import request_from_dict
except ImportError as exc:
    raise ImportError(
        'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
    ) from exc

from apify._crypto import crypto_random_object_id
from apify.actor import Actor


def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
    """Convert a Scrapy request to an Apify request.

    Args:
        scrapy_request: The Scrapy request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Raises:
        TypeError: If the scrapy_request is not an instance of the scrapy.Request class.

    Returns:
        The converted Apify request.
    """
    if not isinstance(scrapy_request, Request):
        raise TypeError('scrapy_request must be an instance of the scrapy.Request class')

    call_id = crypto_random_object_id(8)
    Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')

    apify_request = {
        'url': scrapy_request.url,
        'method': scrapy_request.method,
        'headers': scrapy_request.headers,
        'userData': scrapy_request.meta.get('userData', {}),
    }

    # Add 'id' to the apify_request
    if scrapy_request.meta.get('apify_request_id'):
        apify_request['id'] = scrapy_request.meta['apify_request_id']

    # Add 'uniqueKey' to the apify_request
    if scrapy_request.meta.get('apify_request_unique_key'):
        apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']

    # Serialize the Scrapy Request and store it in the apify_request.
    # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
    #   and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
    # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
    scrapy_request_dict = scrapy_request.to_dict(spider=spider)
    scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()

    apify_request['userData']['scrapy_request'] = scrapy_request_dict_encoded

    Actor.log.debug(f'[{call_id}]: scrapy_request was converted to the apify_request={apify_request}')
    return apify_request


def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
    """Convert an Apify request to a Scrapy request.

    Args:
        apify_request: The Apify request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Raises:
        TypeError: If the apify_request is not a dictionary.
        ValueError: If the apify_request does not contain the required keys.

    Returns:
        The converted Scrapy request.
    """
    if not isinstance(apify_request, dict):
        raise TypeError('apify_request must be a dictionary')

    required_keys = ['url', 'method', 'id', 'uniqueKey']
    missing_keys = [key for key in required_keys if key not in apify_request]

    if missing_keys:
        raise ValueError(f'apify_request must contain {", ".join(map(repr, missing_keys))} key(s)')

    call_id = crypto_random_object_id(8)
    Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')

    # If the apify_request comes from Scrapy
    if 'userData' in apify_request and 'scrapy_request' in apify_request['userData']:
        # Deserialize the Scrapy Request from the apify_request.
        # - This process involves decoding the base64-encoded request data and reconstructing
        #   the Scrapy Request object from its dictionary representation.
        Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')

        scrapy_request_dict_encoded = apify_request['userData']['scrapy_request']
        if not isinstance(scrapy_request_dict_encoded, str):
            raise TypeError('scrapy_request_dict_encoded must be a string')

        scrapy_request_dict = pickle.loads(codecs.decode(scrapy_request_dict_encoded.encode(), 'base64'))
        if not isinstance(scrapy_request_dict, dict):
            raise TypeError('scrapy_request_dict must be a dictionary')

        scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
        if not isinstance(scrapy_request, Request):
            raise TypeError('scrapy_request must be an instance of the Request class')

        Actor.log.debug(f'[{call_id}]: Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')

        # Update the meta field with the meta field from the apify_request
        meta = scrapy_request.meta or {}
        meta.update({'apify_request_id': apify_request['id'], 'apify_request_unique_key': apify_request['uniqueKey']})
        scrapy_request._meta = meta  # scrapy_request.meta is a property, so we have to set it like this

    # If the apify_request comes directly from the Request Queue, typically start URLs
    else:
        Actor.log.debug(f'[{call_id}]: gonna create a new Scrapy Request (cannot be restored)')

        scrapy_request = Request(
            url=apify_request['url'],
            method=apify_request['method'],
            meta={
                'apify_request_id': apify_request['id'],
                'apify_request_unique_key': apify_request['uniqueKey'],
            },
        )

    # Add optional 'headers' field
    if 'headers' in apify_request:
        scrapy_request.headers = apify_request['headers']

    # Add optional 'userData' field
    if 'userData' in apify_request:
        scrapy_request.meta['userData'] = apify_request['userData']

    Actor.log.debug(f'[{call_id}]: an apify_request was converted to the scrapy_request={scrapy_request}')
    return scrapy_request
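The serialization scheme used by `to_apify_request` and reversed by `to_scrapy_request` is symmetric: pickle plus base64, producing a plain string that survives storage in the request queue. A self-contained sketch of just that encode/decode step (the dictionary is a stand-in for the output of `scrapy_request.to_dict(spider=spider)`):

```python
import codecs
import pickle

# A stand-in for the dictionary produced by scrapy_request.to_dict(spider=spider).
original = {'url': 'https://example.com', 'method': 'GET'}

# Encode: pickle the dict, base64 it, and decode the bytes to a str.
encoded = codecs.encode(pickle.dumps(original), 'base64').decode()
assert isinstance(encoded, str)

# Decode: the exact inverse, as performed in to_scrapy_request.
decoded = pickle.loads(codecs.decode(encoded.encode(), 'base64'))
assert decoded == original
```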
3 changes: 2 additions & 1 deletion src/apify/scrapy/scheduler.py
@@ -14,7 +14,8 @@

from apify._crypto import crypto_random_object_id
from apify.actor import Actor
-from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client, to_apify_request, to_scrapy_request
+from apify.scrapy.requests import to_apify_request, to_scrapy_request
+from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client
from apify.storages import RequestQueue


118 changes: 0 additions & 118 deletions src/apify/scrapy/utils.py
@@ -1,23 +1,18 @@
from __future__ import annotations

import asyncio
-import codecs
-import pickle
from base64 import b64encode
from urllib.parse import unquote

try:
-    from scrapy import Request, Spider
    from scrapy.settings import Settings  # noqa: TCH002
    from scrapy.utils.project import get_project_settings
    from scrapy.utils.python import to_bytes
-    from scrapy.utils.request import request_from_dict
except ImportError as exc:
    raise ImportError(
        'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
    ) from exc

-from apify._crypto import crypto_random_object_id
from apify.actor import Actor
from apify.storages import RequestQueue, StorageClientManager

@@ -42,119 +37,6 @@ def get_running_event_loop_id() -> int:
    return id(asyncio.get_running_loop())


def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
    """Convert a Scrapy request to an Apify request.

    Args:
        scrapy_request: The Scrapy request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Raises:
        TypeError: If the scrapy_request is not an instance of the scrapy.Request class.

    Returns:
        The converted Apify request.
    """
    if not isinstance(scrapy_request, Request):
        raise TypeError('scrapy_request must be an instance of the scrapy.Request class')

    call_id = crypto_random_object_id(8)
    Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')

    apify_request = {
        'url': scrapy_request.url,
        'method': scrapy_request.method,
    }

    # Add 'id' to the apify_request
    if scrapy_request.meta.get('apify_request_id'):
        apify_request['id'] = scrapy_request.meta['apify_request_id']

    # Add 'uniqueKey' to the apify_request
    if scrapy_request.meta.get('apify_request_unique_key'):
        apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']

    # Serialize the Scrapy Request and store it in the apify_request.
    # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
    #   and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
    # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
    scrapy_request_dict = scrapy_request.to_dict(spider=spider)
    scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()
    apify_request['userData'] = {'scrapy_request': scrapy_request_dict_encoded}

    Actor.log.debug(f'[{call_id}]: scrapy_request was converted to the apify_request={apify_request}')
    return apify_request


def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
    """Convert an Apify request to a Scrapy request.

    Args:
        apify_request: The Apify request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Raises:
        TypeError: If the apify_request is not a dictionary.
        ValueError: If the apify_request does not contain the required keys.

    Returns:
        The converted Scrapy request.
    """
    if not isinstance(apify_request, dict):
        raise TypeError('apify_request must be a dictionary')

    required_keys = ['url', 'method', 'id', 'uniqueKey']
    missing_keys = [key for key in required_keys if key not in apify_request]

    if missing_keys:
        raise ValueError(f'apify_request must contain {", ".join(map(repr, missing_keys))} key(s)')

    call_id = crypto_random_object_id(8)
    Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')

    # If the apify_request comes from the Scrapy
    if 'userData' in apify_request and 'scrapy_request' in apify_request['userData']:
        # Deserialize the Scrapy Request from the apify_request.
        # - This process involves decoding the base64-encoded request data and reconstructing
        #   the Scrapy Request object from its dictionary representation.
        Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')

        scrapy_request_dict_encoded = apify_request['userData']['scrapy_request']
        if not isinstance(scrapy_request_dict_encoded, str):
            raise TypeError('scrapy_request_dict_encoded must be a string')

        scrapy_request_dict = pickle.loads(codecs.decode(scrapy_request_dict_encoded.encode(), 'base64'))
        if not isinstance(scrapy_request_dict, dict):
            raise TypeError('scrapy_request_dict must be a dictionary')

        scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
        if not isinstance(scrapy_request, Request):
            raise TypeError('scrapy_request must be an instance of the Request class')

        Actor.log.debug(f'[{call_id}]: Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')

        # Update the meta field with the meta field from the apify_request
        meta = scrapy_request.meta or {}
        meta.update({'apify_request_id': apify_request['id'], 'apify_request_unique_key': apify_request['uniqueKey']})
        scrapy_request._meta = meta  # scrapy_request.meta is a property, so we have to set it like this

    # If the apify_request comes directly from the Request Queue, typically start URLs
    else:
        Actor.log.debug(f'[{call_id}]: gonna create a new Scrapy Request (cannot be restored)')

        scrapy_request = Request(
            url=apify_request['url'],
            method=apify_request['method'],
            meta={
                'apify_request_id': apify_request['id'],
                'apify_request_unique_key': apify_request['uniqueKey'],
            },
        )

    Actor.log.debug(f'[{call_id}]: an apify_request was converted to the scrapy_request={scrapy_request}')
    return scrapy_request


def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict | None = None) -> Settings:
    """Integrates Apify configuration into Scrapy project settings.
Empty file.
