
Add apply_apify_settings to Scrapy subpackage (#178)
vdusek committed Jan 23, 2024
1 parent 6294a1c commit 72a37f1
Showing 3 changed files with 106 additions and 1 deletion.
4 changes: 3 additions & 1 deletion CHANGELOG.md
@@ -2,7 +2,9 @@

## [1.5.3](../../releases/tag/v1.5.3) - Unreleased

...
### Added

- Add `apply_apify_settings` function to Scrapy subpackage

## [1.5.2](../../releases/tag/v1.5.2) - 2024-01-19

41 changes: 41 additions & 0 deletions src/apify/scrapy/utils.py
@@ -8,6 +8,8 @@

try:
    from scrapy import Request, Spider
    from scrapy.settings import Settings  # noqa: TCH002
    from scrapy.utils.project import get_project_settings
    from scrapy.utils.python import to_bytes
    from scrapy.utils.request import request_from_dict
except ImportError as exc:
@@ -153,6 +155,45 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
    return scrapy_request


def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict | None = None) -> Settings:
    """Integrate Apify configuration into Scrapy project settings.

    Note: The function directly modifies the passed `settings` object and also returns it.

    Args:
        settings: Scrapy project settings to be modified.
        proxy_config: Proxy configuration to be stored in the settings.

    Returns:
        Scrapy project settings with custom configurations.
    """
    if settings is None:
        settings = get_project_settings()

    # Use ApifyScheduler as the scheduler
    settings['SCHEDULER'] = 'apify.scrapy.scheduler.ApifyScheduler'

    # Add the ActorDatasetPushPipeline into the item pipelines, assigning it the highest priority value (1000)
    # so that it is executed as the final step in the pipeline sequence
    settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000

    # Disable the default RobotsTxtMiddleware; Apify's custom scheduler already handles robots.txt
    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] = None

    # Disable the default HttpProxyMiddleware and add ApifyHttpProxyMiddleware
    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
    settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 950

    # Disable the default RetryMiddleware and add ApifyRetryMiddleware with the highest priority value (1000)
    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
    settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000

    # Store the proxy configuration
    settings['APIFY_PROXY_SETTINGS'] = proxy_config

    return settings


async def open_queue_with_custom_client() -> RequestQueue:
"""Open a Request Queue with custom Apify Client.
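For context, a rough usage sketch (not part of this commit): the helper can be called with an explicit `Settings` object, exactly as the new unit tests do, whereas inside a real Scrapy project you would typically omit the `settings` argument so that `get_project_settings()` is used. The `useApifyProxy` value below is just an illustrative proxy configuration.

```python
from scrapy.settings import Settings

from apify.scrapy.utils import apply_apify_settings

# Start from an empty Settings object; in a Scrapy project you would usually
# pass settings=None and let apply_apify_settings() load the project settings.
settings = apply_apify_settings(
    settings=Settings(),
    proxy_config={'useApifyProxy': True},
)

print(settings.get('SCHEDULER'))             # 'apify.scrapy.scheduler.ApifyScheduler'
print(settings.get('APIFY_PROXY_SETTINGS'))  # {'useApifyProxy': True}
```

As the docstring notes, the same `Settings` instance that was passed in is modified in place and returned.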
62 changes: 62 additions & 0 deletions tests/unit/scrapy/utils/test_apply_apify_settings.py
@@ -0,0 +1,62 @@
from __future__ import annotations

from scrapy.settings import Settings

from apify.scrapy.utils import apply_apify_settings


def test__apply_apify_settings__overrides_scheduler() -> None:
    settings = Settings()
    new_settings = apply_apify_settings(settings=settings)

    assert new_settings.get('SCHEDULER') == 'apify.scrapy.scheduler.ApifyScheduler'


def test__apply_apify_settings__update_item_pipelines() -> None:
    settings = Settings(
        {
            'ITEM_PIPELINES': {
                'scrapy.pipelines.files.FilesPipeline': 1,
            }
        }
    )
    new_settings = apply_apify_settings(settings=settings)

    assert new_settings.get('ITEM_PIPELINES') == {
        'scrapy.pipelines.files.FilesPipeline': 1,
        'apify.scrapy.pipelines.ActorDatasetPushPipeline': 1000,
    }


def test__apply_apify_settings__update_downloader_middlewares() -> None:
    settings = Settings(
        {
            'DOWNLOADER_MIDDLEWARES': {
                'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 123,
                'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 234,
                'scrapy.downloadermiddlewares.retry.RetryMiddleware': 345,
                'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 543,
            },
        }
    )
    new_settings = apply_apify_settings(settings=settings)

    assert new_settings.get('DOWNLOADER_MIDDLEWARES') == {
        'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': None,
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
        'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
        'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 950,
        'apify.scrapy.middlewares.ApifyRetryMiddleware': 1000,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 543,
    }


def test__apply_apify_settings__add_proxy_config() -> None:
    settings = Settings()
    new_settings = apply_apify_settings(settings=settings)
    assert new_settings.get('APIFY_PROXY_SETTINGS') is None

    settings = Settings()
    proxy_config = {'useApifyProxy': True, 'apifyProxyGroups': []}
    new_settings = apply_apify_settings(settings=settings, proxy_config=proxy_config)
    assert new_settings.get('APIFY_PROXY_SETTINGS') == {'useApifyProxy': True, 'apifyProxyGroups': []}
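
As a hedged illustration of where `proxy_config` typically comes from in practice (not part of this commit): inside an Apify Actor it is commonly read from the Actor input and handed to `apply_apify_settings`, which stores it under `APIFY_PROXY_SETTINGS` for `ApifyHttpProxyMiddleware` to pick up. The `proxyConfiguration` input field name is a common Actor convention, not something defined by this change.

```python
from apify import Actor

from apify.scrapy.utils import apply_apify_settings


async def main() -> None:
    async with Actor:
        # 'proxyConfiguration' is a conventional Actor input field name here;
        # adjust it to whatever your Actor's input schema actually defines.
        actor_input = await Actor.get_input() or {}
        proxy_config = actor_input.get('proxyConfiguration')

        # Stored under APIFY_PROXY_SETTINGS and later read by the
        # ApifyHttpProxyMiddleware that apply_apify_settings registers.
        settings = apply_apify_settings(proxy_config=proxy_config)
        # ... hand `settings` to your Scrapy crawler from here on.
```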
