@@ -3,7 +3,7 @@
 from collections.abc import Iterator, MutableMapping
 from datetime import datetime
 from enum import IntEnum
-from typing import TYPE_CHECKING, Annotated, Any, cast
+from typing import TYPE_CHECKING, Annotated, Any, TypedDict, cast

 from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, PlainSerializer, PlainValidator, TypeAdapter
 from yarl import URL
@@ -15,7 +15,7 @@
 from crawlee._utils.urls import validate_http_url

 if TYPE_CHECKING:
-    from typing_extensions import Self
+    from typing_extensions import NotRequired, Required, Self


 class RequestState(IntEnum):
@@ -108,27 +108,57 @@ def __eq__(self, other: object) -> bool:
 user_data_adapter = TypeAdapter(UserData)


-class BaseRequestData(BaseModel):
-    """Data needed to create a new crawling request."""
+class RequestOptions(TypedDict):
+    """Options that can be used to customize request creation.

-    model_config = ConfigDict(populate_by_name=True)
+    This type exactly matches the parameters of the `Request.from_url` method.
+    """

-    url: Annotated[str, BeforeValidator(validate_http_url), Field()]
-    """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
-    and fragments."""
+    url: Required[str]
+    method: NotRequired[HttpMethod]
+    headers: NotRequired[HttpHeaders | dict[str, str] | None]
+    payload: NotRequired[HttpPayload | str | None]
+    label: NotRequired[str | None]
+    unique_key: NotRequired[str | None]
+    id: NotRequired[str | None]
+    keep_url_fragment: NotRequired[bool]
+    use_extended_unique_key: NotRequired[bool]
+    always_enqueue: NotRequired[bool]
+    user_data: NotRequired[dict[str, JsonSerializable]]
+    no_retry: NotRequired[bool]

-    unique_key: Annotated[str, Field(alias='uniqueKey')]
-    """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
-    to the same URL.

-    If `unique_key` is not provided, then it is automatically generated by normalizing the URL.
-    For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`
-    of `http://www.example.com/something`.
+@docs_group('Data structures')
+class Request(BaseModel):
+    """Represents a request in the Crawlee framework, containing the necessary information for crawling operations.

-    Pass an arbitrary non-empty text value to the `unique_key` property
-    to override the default behavior and specify which URLs shall be considered equal.
+    The `Request` class is one of the core components in Crawlee, utilized by various components such as request
+    providers, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests,
+    including the URL, HTTP method, headers, payload, and user data. The user data allows custom information
+    to be stored and persisted throughout the request lifecycle, including its retries.
+
+    Key functionalities include managing the request's identifier (`id`) and the unique key (`unique_key`) used
+    for request deduplication, controlling retries, handling state management, and enabling configuration for
+    session rotation and proxy handling.
+
+    The recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically
+    generates a unique key and identifier based on the URL and request parameters.
+
+    ### Usage
+
+    ```python
+    from crawlee import Request
+
+    request = Request.from_url('https://crawlee.dev')
+    ```
     """

+    model_config = ConfigDict(populate_by_name=True)
+
+    url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+    """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
+    and fragments."""
+
     method: HttpMethod = 'GET'
     """HTTP request method."""

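Since the new `RequestOptions` TypedDict mirrors the `Request.from_url` signature, request parameters can be assembled as a plain typed dict and splatted into the constructor. A minimal sketch, assuming `RequestOptions` is exported alongside `Request` (the exact import path is not shown in this diff):

```python
from crawlee import Request, RequestOptions

# A plain dict, statically checked against the from_url parameters.
options: RequestOptions = {
    'url': 'https://crawlee.dev',
    'method': 'POST',
    'payload': '{"query": "crawlee"}',
    'label': 'detail',
}

# Unpacking the TypedDict forwards each key as a keyword argument.
request = Request.from_url(**options)
```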
@@ -172,79 +202,16 @@ class BaseRequestData(BaseModel):
     handled_at: Annotated[datetime | None, Field(alias='handledAt')] = None
     """Timestamp when the request was handled."""

-    @classmethod
-    def from_url(
-        cls,
-        url: str,
-        *,
-        method: HttpMethod = 'GET',
-        headers: HttpHeaders | dict[str, str] | None = None,
-        payload: HttpPayload | str | None = None,
-        label: str | None = None,
-        unique_key: str | None = None,
-        keep_url_fragment: bool = False,
-        use_extended_unique_key: bool = False,
-        **kwargs: Any,
-    ) -> Self:
-        """Create a new `BaseRequestData` instance from a URL. See `Request.from_url` for more details."""
-        if isinstance(headers, dict) or headers is None:
-            headers = HttpHeaders(headers or {})
-
-        if isinstance(payload, str):
-            payload = payload.encode()
-
-        unique_key = unique_key or compute_unique_key(
-            url,
-            method=method,
-            headers=headers,
-            payload=payload,
-            keep_url_fragment=keep_url_fragment,
-            use_extended_unique_key=use_extended_unique_key,
-        )
-
-        request = cls(
-            url=url,
-            unique_key=unique_key,
-            method=method,
-            headers=headers,
-            payload=payload,
-            **kwargs,
-        )
-
-        if label is not None:
-            request.user_data['label'] = label
-
-        return request
-
-    def get_query_param_from_url(self, param: str, *, default: str | None = None) -> str | None:
-        """Get the value of a specific query parameter from the URL."""
-        query_params = URL(self.url).query
-        return query_params.get(param, default)
-
-
-@docs_group('Data structures')
-class Request(BaseRequestData):
-    """Represents a request in the Crawlee framework, containing the necessary information for crawling operations.
-
-    The `Request` class is one of the core components in Crawlee, utilized by various components such as request
-    providers, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests,
-    including the URL, HTTP method, headers, payload, and user data. The user data allows custom information
-    to be stored and persisted throughout the request lifecycle, including its retries.
-
-    Key functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used
-    for request deduplication, controlling retries, handling state management, and enabling configuration for session
-    rotation and proxy handling.
-
-    The recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically
-    generates a unique key and identifier based on the URL and request parameters.
-
-    ### Usage
+    unique_key: Annotated[str, Field(alias='uniqueKey')]
+    """A unique key identifying the request. Two requests with the same `unique_key` are considered to point
+    to the same URL.

-    ```python
-    from crawlee import Request
+    If `unique_key` is not provided, then it is automatically generated by normalizing the URL.
+    For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`
+    of `http://www.example.com/something`.

-    request = Request.from_url('https://crawlee.dev')
-    ```
+    Pass an arbitrary non-empty text value to the `unique_key` property
+    to override the default behavior and specify which URLs shall be considered equal.
     """

     id: str
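The relocated `unique_key` docstring describes deduplication by URL normalization: per its example, scheme and host are lowercased and a trailing slash is dropped, so two requests differing only in those details share a `unique_key`. A short illustration based on that documented example (not verified against a running crawler):

```python
from crawlee import Request

# Both URLs normalize to 'http://www.example.com/something',
# so the two requests carry the same unique_key and deduplicate to one.
a = Request.from_url('HTTP://www.EXAMPLE.com/something/')
b = Request.from_url('http://www.example.com/something')
assert a.unique_key == b.unique_key

# Passing an explicit unique_key overrides the computed one.
c = Request.from_url('https://crawlee.dev', unique_key='custom-key')
assert c.unique_key == 'custom-key'
```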
@@ -331,12 +298,10 @@ def from_url(

         return request

-    @classmethod
-    def from_base_request_data(cls, base_request_data: BaseRequestData, *, id: str | None = None) -> Self:
-        """Create a complete Request object based on a BaseRequestData instance."""
-        kwargs = base_request_data.model_dump()
-        kwargs['id'] = id or unique_key_to_request_id(base_request_data.unique_key)
-        return cls(**kwargs)
+    def get_query_param_from_url(self, param: str, *, default: str | None = None) -> str | None:
+        """Get the value of a specific query parameter from the URL."""
+        query_params = URL(self.url).query
+        return query_params.get(param, default)

     @property
     def label(self) -> str | None:
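With `from_base_request_data` removed along with `BaseRequestData`, `get_query_param_from_url` now lives directly on `Request`; former call sites presumably build a `Request` via `from_url` (optionally from a `RequestOptions` dict) instead. The method is a thin wrapper over `yarl.URL(self.url).query`, as the hunk above shows; a usage sketch:

```python
from crawlee import Request

request = Request.from_url('https://crawlee.dev/search?q=scraping&page=2')

assert request.get_query_param_from_url('q') == 'scraping'
# Missing parameters fall back to the provided default.
assert request.get_query_param_from_url('missing', default='n/a') == 'n/a'
```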