Update of Scrapy integration, fixing dw middlewares (#186)
vdusek committed Feb 23, 2024
1 parent ea6c7f7 commit 1ca82f8
Showing 8 changed files with 47 additions and 142 deletions.
10 changes: 8 additions & 2 deletions CHANGELOG.md
@@ -1,8 +1,14 @@
# Changelog

## [1.5.6](../../releases/tag/v1.5.6) - Unreleased
## [1.6.0](../../releases/tag/v1.6.0) - Unreleased

...
### Fixed

- Updated the Scrapy integration, with fixes in `ApifyScheduler`, `to_apify_request`, and `apply_apify_settings`.

### Removed

- Removed `ApifyRetryMiddleware` in favor of Scrapy's default `RetryMiddleware`

## [1.5.5](../../releases/tag/v1.5.5) - 2024-02-01

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "apify"
version = "1.5.6"
version = "1.6.0"
description = "Apify SDK for Python"
readme = "README.md"
license = { text = "Apache Software License" }
1 change: 0 additions & 1 deletion src/apify/scrapy/middlewares/__init__.py
@@ -1,2 +1 @@
from .apify_proxy import ApifyHttpProxyMiddleware
from .apify_retry import ApifyRetryMiddleware
117 changes: 0 additions & 117 deletions src/apify/scrapy/middlewares/apify_retry.py

This file was deleted.

23 changes: 17 additions & 6 deletions src/apify/scrapy/requests.py
@@ -16,6 +16,14 @@
from apify.actor import Actor


def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:
"""Returns True if the Scrapy request was produced by a downloader middleware, otherwise False.
Works for RetryMiddleware and RedirectMiddleware.
"""
return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times'))


def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
"""Convert a Scrapy request to an Apify request.
@@ -48,13 +56,16 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
f'scrapy_request.headers is not an instance of the scrapy.http.headers.Headers class, scrapy_request.headers = {scrapy_request.headers}',
)

# Add 'id' to the apify_request
if scrapy_request.meta.get('apify_request_id'):
apify_request['id'] = scrapy_request.meta['apify_request_id']
if _is_request_produced_by_middleware(scrapy_request):
apify_request['uniqueKey'] = scrapy_request.url
else:
# Add 'id' to the apify_request
if scrapy_request.meta.get('apify_request_id'):
apify_request['id'] = scrapy_request.meta['apify_request_id']

# Add 'uniqueKey' to the apify_request
if scrapy_request.meta.get('apify_request_unique_key'):
apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']
# Add 'uniqueKey' to the apify_request
if scrapy_request.meta.get('apify_request_unique_key'):
apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']

# Serialize the Scrapy Request and store it in the apify_request.
# - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
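A minimal sketch of the new branching above, assuming a trivial spider (the spider class, meta values, and URL are hypothetical; `to_apify_request` is imported from this package):

```python
from scrapy import Request, Spider

from apify.scrapy.requests import to_apify_request


class ExampleSpider(Spider):  # hypothetical spider, needed only for the signature
    name = 'example'


spider = ExampleSpider()

# A request coming straight from the spider: 'id' and 'uniqueKey' are taken from its meta.
fresh = Request(
    url='https://example.com',
    meta={'apify_request_id': 'abc123', 'apify_request_unique_key': 'https://example.com'},
)
to_apify_request(fresh, spider)  # -> contains 'id' and the meta-provided 'uniqueKey'

# A request re-issued by RetryMiddleware or RedirectMiddleware carries 'retry_times'
# or 'redirect_times' in its meta, so the URL itself is used as the 'uniqueKey'.
retried = fresh.replace(meta={'retry_times': 1})
to_apify_request(retried, spider)  # -> 'uniqueKey' == 'https://example.com'
```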
13 changes: 12 additions & 1 deletion src/apify/scrapy/scheduler.py
@@ -70,6 +70,8 @@ def has_pending_requests(self: ApifyScheduler) -> bool:
def enqueue_request(self: ApifyScheduler, request: Request) -> bool:
"""Add a request to the scheduler.
This could be called either from a spider or from a downloader middleware (e.g. redirect, retry, ...).
Args:
request: The request to add to the scheduler.
@@ -94,7 +96,7 @@ def enqueue_request(self: ApifyScheduler, request: Request) -> bool:
traceback.print_exc()
raise

Actor.log.debug(f'[{call_id}]: apify_request was added to the RQ (apify_request={apify_request})')
Actor.log.debug(f'[{call_id}]: rq.add_request.result={result}...')
return bool(result['wasAlreadyPresent'])

def next_request(self: ApifyScheduler) -> Request | None:
@@ -109,6 +111,7 @@ def next_request(self: ApifyScheduler) -> Request | None:
if not isinstance(self._rq, RequestQueue):
raise TypeError('self._rq must be an instance of the RequestQueue class')

# Fetch the next request from the Request Queue
try:
apify_request = nested_event_loop.run_until_complete(self._rq.fetch_next_request())
except BaseException:
@@ -123,6 +126,14 @@ def next_request(self: ApifyScheduler) -> Request | None:
if not isinstance(self.spider, Spider):
raise TypeError('self.spider must be an instance of the Spider class')

# Let the Request Queue know that the request is being handled. Every request should be marked as handled;
# retrying is handled by Scrapy's RetryMiddleware.
try:
nested_event_loop.run_until_complete(self._rq.mark_request_as_handled(apify_request))
except BaseException:
traceback.print_exc()
raise

scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
Actor.log.debug(
f'[{call_id}]: apify_request was transformed to the scrapy_request which will be returned (scrapy_request={scrapy_request})',
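Combined with the `uniqueKey` fallback in `requests.py`, marking every fetched request as handled means a failed download comes back as a fresh queue entry rather than being re-served. A condensed, illustrative sketch of the resulting flow (not the real method; error handling and the SDK's nested event loop are simplified away):

```python
import asyncio

from apify.scrapy.requests import to_scrapy_request


def run(coro):
    """Stand-in for the SDK's nested event loop helper (illustrative only)."""
    return asyncio.get_event_loop().run_until_complete(coro)


def next_request_sketch(scheduler):  # hypothetical helper mirroring ApifyScheduler.next_request
    # Fetch the next request from the Request Queue.
    apify_request = run(scheduler._rq.fetch_next_request())
    if apify_request is None:
        return None

    # Mark it as handled right away; if the download later fails, Scrapy's default
    # RetryMiddleware re-enqueues a copy, which to_apify_request() keys by its URL.
    run(scheduler._rq.mark_request_as_handled(apify_request))

    return to_scrapy_request(apify_request, spider=scheduler.spider)
```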
13 changes: 5 additions & 8 deletions src/apify/scrapy/utils.py
@@ -59,16 +59,13 @@ def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict
# ensuring it is executed as the final step in the pipeline sequence
settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000

# Disable the default RobotsTxtMiddleware, Apify's custom scheduler already handles robots.txt
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] = None
# Disable the default AjaxCrawlMiddleware since it can be problematic with Apify. It can return a new request
# during process_response, but currently we have no way of detecting it and handling it properly.
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware'] = None

# Disable the default HttpProxyMiddleware and add ApifyHttpProxyMiddleware
# Replace the default HttpProxyMiddleware with ApifyHttpProxyMiddleware
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 950

# Disable the default RetryMiddleware and add ApifyRetryMiddleware with the highest integer (1000)
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000
settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 750

# Store the proxy configuration
settings['APIFY_PROXY_SETTINGS'] = proxy_config
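The net effect on the middleware stack can be checked directly; a small sketch, assuming default Scrapy settings (the assertions mirror the updated unit test below):

```python
from scrapy.settings import Settings

from apify.scrapy.utils import apply_apify_settings

new_settings = apply_apify_settings(settings=Settings())
middlewares = new_settings.get('DOWNLOADER_MIDDLEWARES')

# ApifyHttpProxyMiddleware replaces the stock HttpProxyMiddleware, now at priority 750.
assert middlewares['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] == 750
assert middlewares['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] is None

# AjaxCrawlMiddleware is disabled, and ApifyRetryMiddleware is no longer registered.
assert middlewares['scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware'] is None
assert 'apify.scrapy.middlewares.ApifyRetryMiddleware' not in middlewares
```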
10 changes: 4 additions & 6 deletions tests/unit/scrapy/utils/test_apply_apify_settings.py
@@ -34,20 +34,18 @@ def test__apply_apify_settings__update_downloader_middlewares() -> None:
'DOWNLOADER_MIDDLEWARES': {
'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 123,
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 234,
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 345,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 543,
},
}
)
new_settings = apply_apify_settings(settings=settings)

assert new_settings.get('DOWNLOADER_MIDDLEWARES') == {
'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': None,
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 950,
'apify.scrapy.middlewares.ApifyRetryMiddleware': 1000,
'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 750,
'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': None,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 543,
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 123,
}


