
Commit

Fixes conversion of userData and headers fields in Apify-Scrapy request translation (#179)
vdusek committed Jan 23, 2024
1 parent 6708f6f commit 1c68f62
Showing 12 changed files with 387 additions and 302 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
@@ -2,7 +2,13 @@

## [1.5.4](../../releases/tag/v1.5.4) - Unreleased

-...
+### Added
+
+- Add support for `headers` field in Apify <-> Scrapy request translation
+
+### Fixed
+
+- Fix conversion of `userData` field in Apify <-> Scrapy request translation

## [1.5.3](../../releases/tag/v1.5.3) - 2024-01-23

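The two entries above are easiest to verify end to end. Below is a minimal sketch of the translation round trip (the spider class, header value, and ID values are invented for illustration; in a real run the `apify_request_id` and `apify_request_unique_key` meta fields are filled in when requests come back from the request queue):

```python
from scrapy import Request, Spider

from apify.scrapy.requests import to_apify_request, to_scrapy_request


class DemoSpider(Spider):
    """Hypothetical spider, used only to satisfy the `spider` argument."""
    name = 'demo'


spider = DemoSpider()
request = Request(
    url='https://example.com',
    headers={'Authorization': 'Bearer <token>'},
    meta={
        'userData': {'label': 'DETAIL'},
        # Invented IDs so that the reverse translation below has its required keys.
        'apify_request_id': 'aBcDeFgHiJkLmNo',
        'apify_request_unique_key': 'https://example.com',
    },
)

apify_request = to_apify_request(request, spider=spider)
assert apify_request['headers'] == request.headers     # newly carried over
assert apify_request['userData']['label'] == 'DETAIL'  # no longer overwritten
assert 'scrapy_request' in apify_request['userData']   # serialized request blob

restored = to_scrapy_request(apify_request, spider=spider)
assert restored.meta['userData']['label'] == 'DETAIL'
```

Before this fix, `to_apify_request` replaced the whole `userData` dictionary with `{'scrapy_request': ...}`, so the `label` value above would have been lost on the way to the request queue.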
1 change: 1 addition & 0 deletions pyproject.toml
@@ -135,6 +135,7 @@ indent-style = "space"
"**/{tests}/*" = [
"D", # Everything from the pydocstyle
"INP001", # File {filename} is part of an implicit namespace package, add an __init__.py
"PT011", # `pytest.raises({ExceptionType})` is too broad, set the `match` parameter or use a more specific exception
"PLR2004", # Magic value used in comparison, consider replacing {value} with a constant variable
"S101", # Use of assert detected
"T20", # flake8-print
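For context, Ruff's PT011 fires on `pytest.raises` with a broad exception type and no `match` parameter; the new ignore permits the bare form under `tests/`. A hypothetical test in that style (the test name is illustrative; `spider` is unused before the validation error, so `None` suffices for the sketch):

```python
import pytest

from apify.scrapy.requests import to_scrapy_request


def test_apify_request_with_missing_keys() -> None:
    # With the ignore in place, the bare form is allowed in tests:
    with pytest.raises(ValueError):
        to_scrapy_request({'url': 'https://example.com', 'method': 'GET'}, spider=None)

    # The stricter form PT011 would otherwise require:
    with pytest.raises(ValueError, match='apify_request must contain'):
        to_scrapy_request({'url': 'https://example.com', 'method': 'GET'}, spider=None)
```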
3 changes: 2 additions & 1 deletion src/apify/scrapy/__init__.py
@@ -1,2 +1,3 @@
+from .requests import to_apify_request, to_scrapy_request
from .scheduler import ApifyScheduler
-from .utils import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request
+from .utils import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client
3 changes: 2 additions & 1 deletion src/apify/scrapy/middlewares/apify_retry.py
@@ -14,7 +14,8 @@
) from exc

from apify.actor import Actor
-from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
+from apify.scrapy.requests import to_apify_request
+from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client

if TYPE_CHECKING:
    from apify.storages import RequestQueue
139 changes: 139 additions & 0 deletions src/apify/scrapy/requests.py
@@ -0,0 +1,139 @@
from __future__ import annotations

import codecs
import pickle

try:
    from scrapy import Request, Spider
    from scrapy.utils.request import request_from_dict
except ImportError as exc:
    raise ImportError(
        'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
    ) from exc

from apify._crypto import crypto_random_object_id
from apify.actor import Actor


def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
    """Convert a Scrapy request to an Apify request.

    Args:
        scrapy_request: The Scrapy request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Raises:
        TypeError: If the scrapy_request is not an instance of the scrapy.Request class.

    Returns:
        The converted Apify request.
    """
    if not isinstance(scrapy_request, Request):
        raise TypeError('scrapy_request must be an instance of the scrapy.Request class')

    call_id = crypto_random_object_id(8)
    Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')

    apify_request = {
        'url': scrapy_request.url,
        'method': scrapy_request.method,
        'headers': scrapy_request.headers,
        'userData': scrapy_request.meta.get('userData', {}),
    }

    # Add 'id' to the apify_request
    if scrapy_request.meta.get('apify_request_id'):
        apify_request['id'] = scrapy_request.meta['apify_request_id']

    # Add 'uniqueKey' to the apify_request
    if scrapy_request.meta.get('apify_request_unique_key'):
        apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']

    # Serialize the Scrapy Request and store it in the apify_request.
    # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
    #   and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
    # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
    scrapy_request_dict = scrapy_request.to_dict(spider=spider)
    scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()

    apify_request['userData']['scrapy_request'] = scrapy_request_dict_encoded

    Actor.log.debug(f'[{call_id}]: scrapy_request was converted to the apify_request={apify_request}')
    return apify_request


def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
    """Convert an Apify request to a Scrapy request.

    Args:
        apify_request: The Apify request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Raises:
        TypeError: If the apify_request is not a dictionary.
        ValueError: If the apify_request does not contain the required keys.

    Returns:
        The converted Scrapy request.
    """
    if not isinstance(apify_request, dict):
        raise TypeError('apify_request must be a dictionary')

    required_keys = ['url', 'method', 'id', 'uniqueKey']
    missing_keys = [key for key in required_keys if key not in apify_request]

    if missing_keys:
        raise ValueError(f'apify_request must contain {", ".join(map(repr, missing_keys))} key(s)')

    call_id = crypto_random_object_id(8)
    Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')

    # If the apify_request comes from Scrapy
    if 'userData' in apify_request and 'scrapy_request' in apify_request['userData']:
        # Deserialize the Scrapy Request from the apify_request.
        # - This process involves decoding the base64-encoded request data and reconstructing
        #   the Scrapy Request object from its dictionary representation.
        Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')

        scrapy_request_dict_encoded = apify_request['userData']['scrapy_request']
        if not isinstance(scrapy_request_dict_encoded, str):
            raise TypeError('scrapy_request_dict_encoded must be a string')

        scrapy_request_dict = pickle.loads(codecs.decode(scrapy_request_dict_encoded.encode(), 'base64'))
        if not isinstance(scrapy_request_dict, dict):
            raise TypeError('scrapy_request_dict must be a dictionary')

        scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
        if not isinstance(scrapy_request, Request):
            raise TypeError('scrapy_request must be an instance of the Request class')

        Actor.log.debug(f'[{call_id}]: Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')

        # Update the meta field with the meta field from the apify_request
        meta = scrapy_request.meta or {}
        meta.update({'apify_request_id': apify_request['id'], 'apify_request_unique_key': apify_request['uniqueKey']})
        scrapy_request._meta = meta  # scrapy_request.meta is a property, so we have to set it like this

    # If the apify_request comes directly from the Request Queue, typically start URLs
    else:
        Actor.log.debug(f'[{call_id}]: gonna create a new Scrapy Request (cannot be restored)')

        scrapy_request = Request(
            url=apify_request['url'],
            method=apify_request['method'],
            meta={
                'apify_request_id': apify_request['id'],
                'apify_request_unique_key': apify_request['uniqueKey'],
            },
        )

    # Add optional 'headers' field
    if 'headers' in apify_request:
        scrapy_request.headers = apify_request['headers']

    # Add optional 'userData' field
    if 'userData' in apify_request:
        scrapy_request.meta['userData'] = apify_request['userData']

    Actor.log.debug(f'[{call_id}]: an apify_request was converted to the scrapy_request={scrapy_request}')
    return scrapy_request
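The serialization scheme used by `to_apify_request` and reversed by `to_scrapy_request` is symmetric: pickle plus base64, producing a plain string that survives storage in the request queue. A self-contained sketch of just that encode/decode step (the dictionary is a stand-in for the output of `scrapy_request.to_dict(spider=spider)`):

```python
import codecs
import pickle

# A stand-in for the dictionary produced by scrapy_request.to_dict(spider=spider).
original = {'url': 'https://example.com', 'method': 'GET'}

# Encode: pickle the dict, base64 it, and decode the bytes to a str.
encoded = codecs.encode(pickle.dumps(original), 'base64').decode()
assert isinstance(encoded, str)

# Decode: the exact inverse, as performed in to_scrapy_request.
decoded = pickle.loads(codecs.decode(encoded.encode(), 'base64'))
assert decoded == original
```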
3 changes: 2 additions & 1 deletion src/apify/scrapy/scheduler.py
@@ -14,7 +14,8 @@

from apify._crypto import crypto_random_object_id
from apify.actor import Actor
-from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client, to_apify_request, to_scrapy_request
+from apify.scrapy.requests import to_apify_request, to_scrapy_request
+from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client
from apify.storages import RequestQueue


118 changes: 0 additions & 118 deletions src/apify/scrapy/utils.py
@@ -1,23 +1,18 @@
from __future__ import annotations

import asyncio
-import codecs
-import pickle
from base64 import b64encode
from urllib.parse import unquote

try:
-    from scrapy import Request, Spider
    from scrapy.settings import Settings  # noqa: TCH002
    from scrapy.utils.project import get_project_settings
    from scrapy.utils.python import to_bytes
-    from scrapy.utils.request import request_from_dict
except ImportError as exc:
    raise ImportError(
        'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
    ) from exc

-from apify._crypto import crypto_random_object_id
from apify.actor import Actor
from apify.storages import RequestQueue, StorageClientManager

@@ -42,119 +37,6 @@ def get_running_event_loop_id() -> int:
    return id(asyncio.get_running_loop())


def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
    """Convert a Scrapy request to an Apify request.

    Args:
        scrapy_request: The Scrapy request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Raises:
        TypeError: If the scrapy_request is not an instance of the scrapy.Request class.

    Returns:
        The converted Apify request.
    """
    if not isinstance(scrapy_request, Request):
        raise TypeError('scrapy_request must be an instance of the scrapy.Request class')

    call_id = crypto_random_object_id(8)
    Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')

    apify_request = {
        'url': scrapy_request.url,
        'method': scrapy_request.method,
    }

    # Add 'id' to the apify_request
    if scrapy_request.meta.get('apify_request_id'):
        apify_request['id'] = scrapy_request.meta['apify_request_id']

    # Add 'uniqueKey' to the apify_request
    if scrapy_request.meta.get('apify_request_unique_key'):
        apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']

    # Serialize the Scrapy Request and store it in the apify_request.
    # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
    #   and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
    # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
    scrapy_request_dict = scrapy_request.to_dict(spider=spider)
    scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()
    apify_request['userData'] = {'scrapy_request': scrapy_request_dict_encoded}

    Actor.log.debug(f'[{call_id}]: scrapy_request was converted to the apify_request={apify_request}')
    return apify_request


def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
    """Convert an Apify request to a Scrapy request.

    Args:
        apify_request: The Apify request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Raises:
        TypeError: If the apify_request is not a dictionary.
        ValueError: If the apify_request does not contain the required keys.

    Returns:
        The converted Scrapy request.
    """
    if not isinstance(apify_request, dict):
        raise TypeError('apify_request must be a dictionary')

    required_keys = ['url', 'method', 'id', 'uniqueKey']
    missing_keys = [key for key in required_keys if key not in apify_request]

    if missing_keys:
        raise ValueError(f'apify_request must contain {", ".join(map(repr, missing_keys))} key(s)')

    call_id = crypto_random_object_id(8)
    Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')

    # If the apify_request comes from the Scrapy
    if 'userData' in apify_request and 'scrapy_request' in apify_request['userData']:
        # Deserialize the Scrapy Request from the apify_request.
        # - This process involves decoding the base64-encoded request data and reconstructing
        #   the Scrapy Request object from its dictionary representation.
        Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')

        scrapy_request_dict_encoded = apify_request['userData']['scrapy_request']
        if not isinstance(scrapy_request_dict_encoded, str):
            raise TypeError('scrapy_request_dict_encoded must be a string')

        scrapy_request_dict = pickle.loads(codecs.decode(scrapy_request_dict_encoded.encode(), 'base64'))
        if not isinstance(scrapy_request_dict, dict):
            raise TypeError('scrapy_request_dict must be a dictionary')

        scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
        if not isinstance(scrapy_request, Request):
            raise TypeError('scrapy_request must be an instance of the Request class')

        Actor.log.debug(f'[{call_id}]: Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')

        # Update the meta field with the meta field from the apify_request
        meta = scrapy_request.meta or {}
        meta.update({'apify_request_id': apify_request['id'], 'apify_request_unique_key': apify_request['uniqueKey']})
        scrapy_request._meta = meta  # scrapy_request.meta is a property, so we have to set it like this

    # If the apify_request comes directly from the Request Queue, typically start URLs
    else:
        Actor.log.debug(f'[{call_id}]: gonna create a new Scrapy Request (cannot be restored)')

        scrapy_request = Request(
            url=apify_request['url'],
            method=apify_request['method'],
            meta={
                'apify_request_id': apify_request['id'],
                'apify_request_unique_key': apify_request['uniqueKey'],
            },
        )

    Actor.log.debug(f'[{call_id}]: an apify_request was converted to the scrapy_request={scrapy_request}')
    return scrapy_request


def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict | None = None) -> Settings:
    """Integrates Apify configuration into Scrapy project settings.
Empty file.
