
Commit 6b15957

Mantisus, vdusek, and janbuchar authored
feat: add transform_request_function for enqueue_links (#923)
### Description

- Add `transform_request_function` for `enqueue_links`
- Add example in introduction section
- Add tests

### Issues

- Closes: #894

---------

Co-authored-by: Vlada Dusek <[email protected]>
Co-authored-by: Jan Buchar <[email protected]>
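In practice the new parameter plugs into `enqueue_links` inside a request handler. A minimal sketch of that wiring (the `skip_pdfs` helper and the start URL are illustrative, not part of the commit; the documented example is in the diff below):

```python
from __future__ import annotations

from crawlee import RequestOptions, RequestTransformAction
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


def skip_pdfs(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Drop links to PDF files; enqueue everything else as-is.
    if request_options['url'].endswith('.pdf'):
        return 'skip'
    return 'unchanged'


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        # Every discovered link is passed through skip_pdfs before enqueuing.
        await context.enqueue_links(transform_request_function=skip_pdfs)

    await crawler.run(['https://crawlee.dev/'])
```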
1 parent edb9245 commit 6b15957

File tree

12 files changed, +276 −143 lines changed


docs/introduction/03_adding_more_urls.mdx

+7 −4
@@ -10,6 +10,7 @@ import OriginalCodeExample from '!!raw-loader!./code/03_original_code.py';
 import FindingNewLinksExample from '!!raw-loader!./code/03_finding_new_links.py';
 import EnqueueStrategyExample from '!!raw-loader!./code/03_enqueue_strategy.py';
 import GlobsExample from '!!raw-loader!./code/03_globs.py';
+import TransformExample from '!!raw-loader!./code/03_transform_request.py';
 
 Previously you've built a very simple crawler that downloads HTML of a single page, reads its title and prints it to the console. This is the original source code:
 
@@ -106,11 +107,13 @@ For even more control, you can use the `include` or `exclude` parameters, either
 {GlobsExample}
 </CodeBlock>
 
-{/* TODO:
-### Transform requests
+### Transform requests before enqueuing
 
-...
-*/}
+For cases where you need to modify or filter requests before they are enqueued, you can use the `transform_request_function` parameter. It takes `RequestOptions` as input and should return either modified `RequestOptions`, the string `'skip'` to exclude the request from being enqueued, or `'unchanged'` to enqueue the request without modification.
+
+<CodeBlock className="language-python">
+{TransformExample}
+</CodeBlock>
 
 ## Next steps
 
docs/introduction/code/03_transform_request.py

+41 −0
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+from crawlee import HttpHeaders, RequestOptions, RequestTransformAction
+from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+
+
+def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
+    # Skip requests to PDF files
+    if request_options['url'].endswith('.pdf'):
+        return 'skip'
+
+    if '/docs' in request_options['url']:
+        # Add custom headers to requests to specific URLs
+        request_options['headers'] = HttpHeaders({'Custom-Header': 'value'})
+
+    elif '/blog' in request_options['url']:
+        # Add label for certain URLs
+        request_options['label'] = 'BLOG'
+
+    else:
+        # Signal that the request should proceed without any transformation
+        return 'unchanged'
+
+    return request_options
+
+
+async def main() -> None:
+    crawler = BeautifulSoupCrawler(max_requests_per_crawl=50)
+
+    @crawler.router.default_handler
+    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url}.')
+
+        # Transform request before enqueueing
+        await context.enqueue_links(transform_request_function=transform_request)
+
+    @crawler.router.handler('BLOG')
+    async def blog_handler(context: BeautifulSoupCrawlingContext) -> None:
+        context.log.info(f'Blog Processing {context.request.url}.')
+
+    await crawler.run(['https://crawlee.dev/'])
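The example above only defines `main()`, since the docs tooling presumably supplies the entry point when the snippet is rendered. To try it locally, a standard asyncio runner (not part of the commit) is enough:

```python
import asyncio

# Hypothetical local entry point; the committed example stops at defining main().
if __name__ == '__main__':
    asyncio.run(main())
```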

src/crawlee/__init__.py

+12 −3

@@ -1,10 +1,19 @@
 from importlib import metadata
 
-from ._request import Request
+from ._request import Request, RequestOptions
 from ._service_locator import service_locator
-from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders
+from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction
 from ._utils.globs import Glob
 
 __version__ = metadata.version('crawlee')
 
-__all__ = ['ConcurrencySettings', 'EnqueueStrategy', 'Glob', 'HttpHeaders', 'Request', 'service_locator']
+__all__ = [
+    'ConcurrencySettings',
+    'EnqueueStrategy',
+    'Glob',
+    'HttpHeaders',
+    'Request',
+    'RequestOptions',
+    'RequestTransformAction',
+    'service_locator',
+]
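With these re-exports, both new names are importable from the package root. A quick sketch (the URL and label values are illustrative):

```python
from crawlee import RequestOptions, RequestTransformAction

# RequestOptions is a TypedDict, so a plain dict literal type-checks against it.
options: RequestOptions = {'url': 'https://crawlee.dev', 'label': 'HOME'}

# RequestTransformAction is the Literal['skip', 'unchanged'] alias.
action: RequestTransformAction = 'unchanged'
```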

src/crawlee/_request.py

+58 −93

@@ -3,7 +3,7 @@
 from collections.abc import Iterator, MutableMapping
 from datetime import datetime
 from enum import IntEnum
-from typing import TYPE_CHECKING, Annotated, Any, cast
+from typing import TYPE_CHECKING, Annotated, Any, TypedDict, cast
 
 from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, PlainSerializer, PlainValidator, TypeAdapter
 from yarl import URL
@@ -15,7 +15,7 @@
 from crawlee._utils.urls import validate_http_url
 
 if TYPE_CHECKING:
-    from typing_extensions import Self
+    from typing_extensions import NotRequired, Required, Self
 
 
 class RequestState(IntEnum):
@@ -108,27 +108,57 @@ def __eq__(self, other: object) -> bool:
 user_data_adapter = TypeAdapter(UserData)
 
 
-class BaseRequestData(BaseModel):
-    """Data needed to create a new crawling request."""
+class RequestOptions(TypedDict):
+    """Options that can be used to customize request creation.
 
-    model_config = ConfigDict(populate_by_name=True)
+    This type exactly matches the parameters of `Request.from_url` method.
+    """
 
-    url: Annotated[str, BeforeValidator(validate_http_url), Field()]
-    """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
-    and fragments."""
+    url: Required[str]
+    method: NotRequired[HttpMethod]
+    headers: NotRequired[HttpHeaders | dict[str, str] | None]
+    payload: NotRequired[HttpPayload | str | None]
+    label: NotRequired[str | None]
+    unique_key: NotRequired[str | None]
+    id: NotRequired[str | None]
+    keep_url_fragment: NotRequired[bool]
+    use_extended_unique_key: NotRequired[bool]
+    always_enqueue: NotRequired[bool]
+    user_data: NotRequired[dict[str, JsonSerializable]]
+    no_retry: NotRequired[bool]
 
-    unique_key: Annotated[str, Field(alias='uniqueKey')]
-    """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
-    to the same URL.
 
-    If `unique_key` is not provided, then it is automatically generated by normalizing the URL.
-    For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`
-    of `http://www.example.com/something`.
+@docs_group('Data structures')
+class Request(BaseModel):
+    """Represents a request in the Crawlee framework, containing the necessary information for crawling operations.
 
-    Pass an arbitrary non-empty text value to the `unique_key` property
-    to override the default behavior and specify which URLs shall be considered equal.
+    The `Request` class is one of the core components in Crawlee, utilized by various components such as request
+    providers, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests,
+    including the URL, HTTP method, headers, payload, and user data. The user data allows custom information
+    to be stored and persisted throughout the request lifecycle, including its retries.
+
+    Key functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used
+    for request deduplication, controlling retries, handling state management, and enabling configuration for session
+    rotation and proxy handling.
+
+    The recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically
+    generates a unique key and identifier based on the URL and request parameters.
+
+    ### Usage
+
+    ```python
+    from crawlee import Request
+
+    request = Request.from_url('https://crawlee.dev')
+    ```
     """
 
+    model_config = ConfigDict(populate_by_name=True)
+
+    url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+    """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
+    and fragments."""
+
     method: HttpMethod = 'GET'
     """HTTP request method."""
 
@@ -172,79 +202,16 @@ class BaseRequestData(BaseModel):
     handled_at: Annotated[datetime | None, Field(alias='handledAt')] = None
     """Timestamp when the request was handled."""
 
-    @classmethod
-    def from_url(
-        cls,
-        url: str,
-        *,
-        method: HttpMethod = 'GET',
-        headers: HttpHeaders | dict[str, str] | None = None,
-        payload: HttpPayload | str | None = None,
-        label: str | None = None,
-        unique_key: str | None = None,
-        keep_url_fragment: bool = False,
-        use_extended_unique_key: bool = False,
-        **kwargs: Any,
-    ) -> Self:
-        """Create a new `BaseRequestData` instance from a URL. See `Request.from_url` for more details."""
-        if isinstance(headers, dict) or headers is None:
-            headers = HttpHeaders(headers or {})
-
-        if isinstance(payload, str):
-            payload = payload.encode()
-
-        unique_key = unique_key or compute_unique_key(
-            url,
-            method=method,
-            headers=headers,
-            payload=payload,
-            keep_url_fragment=keep_url_fragment,
-            use_extended_unique_key=use_extended_unique_key,
-        )
-
-        request = cls(
-            url=url,
-            unique_key=unique_key,
-            method=method,
-            headers=headers,
-            payload=payload,
-            **kwargs,
-        )
-
-        if label is not None:
-            request.user_data['label'] = label
-
-        return request
-
-    def get_query_param_from_url(self, param: str, *, default: str | None = None) -> str | None:
-        """Get the value of a specific query parameter from the URL."""
-        query_params = URL(self.url).query
-        return query_params.get(param, default)
-
-
-@docs_group('Data structures')
-class Request(BaseRequestData):
-    """Represents a request in the Crawlee framework, containing the necessary information for crawling operations.
-
-    The `Request` class is one of the core components in Crawlee, utilized by various components such as request
-    providers, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests,
-    including the URL, HTTP method, headers, payload, and user data. The user data allows custom information
-    to be stored and persisted throughout the request lifecycle, including its retries.
-
-    Key functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used
-    for request deduplication, controlling retries, handling state management, and enabling configuration for session
-    rotation and proxy handling.
-
-    The recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically
-    generates a unique key and identifier based on the URL and request parameters.
-
-    ### Usage
+    unique_key: Annotated[str, Field(alias='uniqueKey')]
+    """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
+    to the same URL.
 
-    ```python
-    from crawlee import Request
+    If `unique_key` is not provided, then it is automatically generated by normalizing the URL.
+    For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`
+    of `http://www.example.com/something`.
 
-    request = Request.from_url('https://crawlee.dev')
-    ```
+    Pass an arbitrary non-empty text value to the `unique_key` property
+    to override the default behavior and specify which URLs shall be considered equal.
     """
 
     id: str
@@ -331,12 +298,10 @@ def from_url(
 
         return request
 
-    @classmethod
-    def from_base_request_data(cls, base_request_data: BaseRequestData, *, id: str | None = None) -> Self:
-        """Create a complete Request object based on a BaseRequestData instance."""
-        kwargs = base_request_data.model_dump()
-        kwargs['id'] = id or unique_key_to_request_id(base_request_data.unique_key)
-        return cls(**kwargs)
+    def get_query_param_from_url(self, param: str, *, default: str | None = None) -> str | None:
+        """Get the value of a specific query parameter from the URL."""
+        query_params = URL(self.url).query
+        return query_params.get(param, default)
 
     @property
     def label(self) -> str | None:
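Because `RequestOptions` mirrors the parameters of `Request.from_url`, transformed options can be splatted straight into that constructor, and `get_query_param_from_url` now lives directly on `Request`. A small sketch (the URL and label values are illustrative):

```python
from crawlee import Request, RequestOptions

options: RequestOptions = {'url': 'https://crawlee.dev/docs?page=2', 'label': 'DOCS'}

# The TypedDict keys line up with the keyword arguments of Request.from_url.
request = Request.from_url(**options)

# The query-param helper moved from BaseRequestData onto Request itself.
assert request.get_query_param_from_url('page') == '2'
```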

src/crawlee/_types.py

+12 −5

@@ -3,7 +3,7 @@
 from collections.abc import Iterator, Mapping
 from dataclasses import dataclass
 from enum import Enum
-from typing import TYPE_CHECKING, Annotated, Any, Literal, Optional, Protocol, TypeVar, Union, cast, overload
+from typing import TYPE_CHECKING, Annotated, Any, Callable, Literal, Optional, Protocol, TypeVar, Union, cast, overload
 
 from pydantic import ConfigDict, Field, PlainValidator, RootModel
 from typing_extensions import NotRequired, TypeAlias, TypedDict, Unpack
@@ -16,7 +16,7 @@
     from collections.abc import Coroutine, Sequence
 
     from crawlee import Glob, Request
-    from crawlee._request import BaseRequestData
+    from crawlee._request import RequestOptions
     from crawlee.http_clients import HttpResponse
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.sessions import Session
@@ -44,6 +44,8 @@
 
 HttpPayload: TypeAlias = bytes
 
+RequestTransformAction: TypeAlias = Literal['skip', 'unchanged']
+
 
 def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
     """Converts all header keys to lowercase, strips whitespace, and returns them sorted by key."""
@@ -182,7 +184,7 @@ class EnqueueLinksKwargs(TypedDict):
 class AddRequestsKwargs(EnqueueLinksKwargs):
     """Keyword arguments for the `add_requests` methods."""
 
-    requests: Sequence[str | BaseRequestData | Request]
+    requests: Sequence[str | Request]
     """Requests to be added to the `RequestManager`."""
 
 
@@ -264,7 +266,7 @@ def __init__(self, *, key_value_store_getter: GetKeyValueStoreFunction) -> None:
 
     async def add_requests(
         self,
-        requests: Sequence[str | BaseRequestData],
+        requests: Sequence[str | Request],
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> None:
         """Track a call to the `add_requests` context helper."""
@@ -315,7 +317,7 @@ class AddRequestsFunction(Protocol):
 
     def __call__(
         self,
-        requests: Sequence[str | BaseRequestData | Request],
+        requests: Sequence[str | Request],
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]:
         """Call dunder method.
@@ -341,6 +343,7 @@ def __call__(
         selector: str = 'a',
         label: str | None = None,
         user_data: dict[str, Any] | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]:
         """A call dunder method.
@@ -353,6 +356,10 @@ def __call__(
                 - `BeautifulSoupCrawler` supports CSS selectors.
             label: Label for the newly created `Request` objects, used for request routing.
             user_data: User data to be provided to the newly created `Request` objects.
+            transform_request_function: A function that takes `RequestOptions` and returns either:
+                - Modified `RequestOptions` to update the request configuration,
+                - `'skip'` to exclude the request from being enqueued,
+                - `'unchanged'` to use the original request options without modification.
             **kwargs: Additional keyword arguments.
         """
 
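Taken together, the `_types.py` changes mean `transform_request_function` accepts any plain callable from `RequestOptions` to either new options or an action string, matching `Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None`. A sketch of a callback with that shape (the `/login` rule and the header value are illustrative):

```python
from __future__ import annotations

from crawlee import HttpHeaders, RequestOptions, RequestTransformAction


def drop_logins(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # 'skip' excludes the request from being enqueued entirely.
    if '/login' in options['url']:
        return 'skip'
    # Returning modified options enqueues the request with the extra header attached.
    options['headers'] = HttpHeaders({'x-source': 'enqueue-links'})
    return options
```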
0 commit comments
