Skip to content

Commit 29b166e

Browse files
authored
Add sessions subpackage (#108)
1 parent d33d1c5 commit 29b166e

14 files changed

+1004
-54
lines changed

pyproject.toml

+2
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ pydantic-settings = "^2.2.1"
5858
pyee = "^11.1.0"
5959
sortedcollections = "^2.1.0"
6060
typing-extensions = "^4.1.0"
61+
python-dateutil = "^2.9.0"
6162

6263
[tool.poetry.group.dev.dependencies]
6364
build = "~1.2.0"
@@ -77,6 +78,7 @@ twine = "~5.0.0"
7778
types-aiofiles = "^23.2.0.20240106"
7879
types-colorama = "~0.4.15.20240106"
7980
types-psutil = "~5.9.5.20240205"
81+
types-python-dateutil = "^2.9.0.20240316"
8082

8183
[tool.ruff]
8284
line-length = 120

src/crawlee/autoscaling/snapshotter.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -94,11 +94,7 @@ def __init__(
9494
self._reserve_memory_ratio = reserve_memory_ratio
9595
self._memory_warning_cooldown_period = memory_warning_cooldown_period
9696
self._client_rate_limit_error_retry_count = client_rate_limit_error_retry_count
97-
98-
if max_memory_size is None:
99-
self._max_memory_size = self._get_default_max_memory_size()
100-
else:
101-
self._max_memory_size = max_memory_size
97+
self._max_memory_size = max_memory_size or self._get_default_max_memory_size()
10298

10399
self._cpu_snapshots: list[CpuSnapshot] = []
104100
self._event_loop_snapshots: list[EventLoopSnapshot] = []

src/crawlee/events/event_manager.py

+28-24
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
if TYPE_CHECKING:
1515
from datetime import timedelta
16+
from types import TracebackType
1617

1718
from crawlee.events.types import Event, EventData, Listener, WrappedListener
1819

@@ -26,8 +27,13 @@ class EventManager:
2627
their execution. It is built on top of the `pyee.asyncio.AsyncIOEventEmitter` class.
2728
"""
2829

29-
def __init__(self) -> None:
30-
logger.debug('Calling EventManager.__init__()...')
30+
def __init__(self, close_timeout: timedelta | None = None) -> None:
31+
"""Create a new instance.
32+
33+
Args:
34+
close_timeout: Optional timeout after which the pending event listeners are canceled.
35+
"""
36+
self._close_timeout = close_timeout
3137

3238
# Asynchronous event emitter for handle events and invoke the event listeners.
3339
self._event_emitter = AsyncIOEventEmitter()
@@ -41,19 +47,35 @@ def __init__(self) -> None:
4147
lambda: defaultdict(list),
4248
)
4349

50+
async def __aenter__(self) -> EventManager:
51+
"""Initializes the event manager upon entering the async context."""
52+
return self
53+
54+
async def __aexit__(
55+
self,
56+
exc_type: type[BaseException] | None,
57+
exc_value: BaseException | None,
58+
exc_traceback: TracebackType | None,
59+
) -> None:
60+
"""Closes the local event manager upon exiting the async context.
61+
62+
This will stop listening for the events, and it will wait for all the event listeners to finish.
63+
"""
64+
await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
65+
self._event_emitter.remove_all_listeners()
66+
self._listener_tasks.clear()
67+
self._listeners_to_wrappers.clear()
68+
4469
def on(self, *, event: Event, listener: Listener) -> None:
4570
"""Add an event listener to the event manager.
4671
4772
Args:
4873
event: The Actor event for which to listen to.
4974
listener: The function (sync or async) which is to be called when the event is emitted.
5075
"""
51-
logger.debug('Calling EventManager.on()...')
5276

5377
@wraps(listener)
5478
async def listener_wrapper(event_data: EventData) -> None:
55-
logger.debug('Calling EventManager.on.listener_wrapper()...')
56-
5779
# If the listener is a coroutine function, just call it, otherwise, run it in a separate thread
5880
# to avoid blocking the event loop
5981
coro = (
@@ -92,8 +114,6 @@ def off(self, *, event: Event, listener: Listener | None = None) -> None:
92114
listener: The listener which is supposed to be removed. If not passed, all listeners of this event
93115
are removed.
94116
"""
95-
logger.debug('Calling EventManager.off()...')
96-
97117
if listener:
98118
for listener_wrapper in self._listeners_to_wrappers[event][listener]:
99119
self._event_emitter.remove_listener(event.value, listener_wrapper)
@@ -109,31 +129,15 @@ def emit(self, *, event: Event, event_data: EventData) -> None:
109129
event: The event which will be emitted.
110130
event_data: The data which will be passed to the event listeners.
111131
"""
112-
logger.debug('Calling EventManager.emit()...')
113132
self._event_emitter.emit(event.value, event_data)
114133

115-
async def close(self, *, timeout: timedelta | None = None) -> None:
116-
"""Close the event manager.
117-
118-
This will stop listening for the events, and it will wait for all the event listeners to finish.
119-
120-
Args:
121-
timeout: Optional timeout after which the pending event listeners are canceled.
122-
"""
123-
logger.debug('Calling EventManager.close()...')
124-
await self._wait_for_all_listeners_to_complete(timeout=timeout)
125-
self._event_emitter.remove_all_listeners()
126-
self._listener_tasks.clear()
127-
self._listeners_to_wrappers.clear()
128-
129-
async def _wait_for_all_listeners_to_complete(self, *, timeout: timedelta | None = None) -> None:
134+
async def wait_for_all_listeners_to_complete(self, *, timeout: timedelta | None = None) -> None:
130135
"""Wait for all currently executing event listeners to complete.
131136
132137
Args:
133138
timeout: The maximum time to wait for the event listeners to finish. If they do not complete within
134139
the specified timeout, they will be canceled.
135140
"""
136-
logger.debug('Calling EventManager.wait_for_all_listeners_to_complete()...')
137141

138142
async def wait_for_listeners() -> None:
139143
"""Gathers all listener tasks and awaits their completion, logging any exceptions encountered."""

src/crawlee/events/local_event_manager.py

+4-10
Original file line numberDiff line numberDiff line change
@@ -25,18 +25,14 @@ def __init__(
2525
self,
2626
*,
2727
system_info_interval: timedelta = timedelta(seconds=60),
28-
close_timeout: timedelta | None = None,
2928
) -> None:
3029
"""Create a new instance.
3130
3231
Args:
3332
system_info_interval: Interval at which `SystemInfo` events are emitted.
3433
close_timeout: Optional timeout for closing the event manager.
3534
"""
36-
logger.debug('Calling LocalEventManager.__init__()...')
37-
3835
self._system_info_interval = system_info_interval
39-
self._close_timeout = close_timeout
4036

4137
# Recurring task for emitting system info events.
4238
self._emit_system_info_event_rec_task: RecurringTask | None = None
@@ -48,12 +44,14 @@ async def __aenter__(self) -> LocalEventManager:
4844
4945
It starts emitting system info events at regular intervals.
5046
"""
51-
logger.debug('Calling LocalEventManager.__aenter__()...')
47+
await super().__aenter__()
48+
5249
self._emit_system_info_event_rec_task = RecurringTask(
5350
func=self._emit_system_info_event,
5451
delay=self._system_info_interval,
5552
)
5653
self._emit_system_info_event_rec_task.start()
54+
5755
return self
5856

5957
async def __aexit__(
@@ -66,17 +64,13 @@ async def __aexit__(
6664
6765
It stops emitting system info events and closes the event manager.
6866
"""
69-
logger.debug('Calling LocalEventManager.__aexit__()...')
70-
7167
if self._emit_system_info_event_rec_task is not None:
7268
await self._emit_system_info_event_rec_task.stop()
7369

74-
await super().close(timeout=self._close_timeout)
70+
await super().__aexit__(exc_type, exc_value, exc_traceback)
7571

7672
async def _emit_system_info_event(self) -> None:
7773
"""Emits a system info event with the current CPU and memory usage."""
78-
logger.debug('Calling LocalEventManager._emit_system_info_event()...')
79-
8074
cpu_info = await asyncio.to_thread(get_cpu_info)
8175
memory_info = await asyncio.to_thread(get_memory_info)
8276

src/crawlee/events/types.py

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class Event(Enum):
1717
MIGRATING = 'migrating'
1818
ABORTING = 'aborting'
1919
EXIT = 'exit'
20+
SESSION_RETIRED = 'sessionRetired'
2021

2122

2223
@dataclass

src/crawlee/sessions/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from .session import Session
2+
from .session_pool import SessionPool

src/crawlee/sessions/models.py

+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# ruff: noqa: TCH002 TCH003
2+
3+
from __future__ import annotations
4+
5+
from datetime import datetime, timedelta
6+
from typing import Annotated, Any
7+
8+
from dateutil import parser
9+
from pydantic import BaseModel, ConfigDict, Field, field_validator
10+
11+
12+
class SessionModel(BaseModel):
13+
"""Model for a Session object."""
14+
15+
model_config = ConfigDict(populate_by_name=True)
16+
17+
id: Annotated[str, Field(alias='id')]
18+
max_age: Annotated[timedelta, Field(alias='maxAge')]
19+
user_data: Annotated[dict, Field(alias='userData')]
20+
max_error_score: Annotated[float, Field(alias='maxErrorScore')]
21+
error_score_decrement: Annotated[float, Field(alias='errorScoreDecrement')]
22+
created_at: Annotated[datetime, Field(alias='createdAt')]
23+
usage_count: Annotated[int, Field(alias='usageCount')]
24+
max_usage_count: Annotated[int, Field(alias='maxUsageCount')]
25+
error_score: Annotated[float, Field(alias='errorScore')]
26+
cookies: Annotated[dict, Field(alias='cookies')]
27+
blocked_status_codes: Annotated[list[int], Field(alias='blockedStatusCodes')]
28+
29+
@field_validator('max_age', mode='before')
30+
@classmethod
31+
def parse_max_age(cls, value: Any) -> timedelta:
32+
"""Try to parse max_age field into a timedelta object."""
33+
if isinstance(value, timedelta):
34+
return value
35+
36+
if isinstance(value, str):
37+
try:
38+
parsed_time = parser.parse(value)
39+
return timedelta(hours=parsed_time.hour, minutes=parsed_time.minute, seconds=parsed_time.second)
40+
except ValueError as exc:
41+
raise ValueError(f"Invalid time format for max_age. Expected 'HH:MM:SS', got {value}") from exc
42+
43+
raise ValueError('Invalid data type for max_age')
44+
45+
46+
class SessionPoolModel(BaseModel):
47+
"""Model for a SessionPool object."""
48+
49+
model_config = ConfigDict(populate_by_name=True)
50+
51+
persistence_enabled: Annotated[bool, Field(alias='persistenceEnabled')]
52+
persist_state_kvs_name: Annotated[str, Field(alias='persistStateKvsName')]
53+
persist_state_key: Annotated[str, Field(alias='persistStateKey')]
54+
max_pool_size: Annotated[int, Field(alias='maxPoolSize')]
55+
session_count: Annotated[int, Field(alias='sessionCount')]
56+
usable_session_count: Annotated[int, Field(alias='usableSessionCount')]
57+
retired_session_count: Annotated[int, Field(alias='retiredSessionCount')]
58+
sessions: Annotated[list[SessionModel], Field(alias='sessions')]

0 commit comments

Comments
 (0)