
Commit 78ea9d2

feat: Make timeout error log cleaner (#1170)
### Description

Make timeout-based error logs in the request handler more readable by filtering clutter out of the stack trace. Add a one-line warning, including the most important stack trace line when one can be identified, for failed request handler calls.

### Issues

- Closes: #1158
1 parent c40f318 commit 78ea9d2
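
For illustration, a minimal sketch (not part of this commit) of the scenario the change targets, adapted from the new test at the bottom of the diff — the URL, sleep duration, and public import paths are assumptions:

    import asyncio
    from datetime import timedelta

    from crawlee import Request
    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


    async def main() -> None:
        # The handler sleeps past the 1-second timeout, so each request fails
        # with an asyncio TimeoutError; with this commit the failure is logged
        # as a short summary instead of a full chained traceback.
        crawler = BasicCrawler(request_handler_timeout=timedelta(seconds=1))

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            await asyncio.sleep(10)

        await crawler.run([Request.from_url('http://a.com/')])


    asyncio.run(main())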

File tree

3 files changed, +128 −2 lines changed
src/crawlee/crawlers/_basic/_basic_crawler.py
+42 −2
@@ -7,6 +7,7 @@
 import sys
 import tempfile
 import threading
+import traceback
 from asyncio import CancelledError
 from collections.abc import AsyncGenerator, Awaitable, Iterable, Sequence
 from contextlib import AsyncExitStack, suppress
@@ -55,6 +56,10 @@
 from crawlee.storages import Dataset, KeyValueStore, RequestQueue
 
 from ._context_pipeline import ContextPipeline
+from ._logging_utils import (
+    get_one_line_error_summary_if_possible,
+    reduce_asyncio_timeout_error_to_relevant_traceback_parts,
+)
 
 if TYPE_CHECKING:
     import re
@@ -218,6 +223,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
     """
 
     _CRAWLEE_STATE_KEY = 'CRAWLEE_STATE'
+    _request_handler_timeout_text = 'Request handler timed out after'
 
     def __init__(
         self,
@@ -921,6 +927,10 @@ async def _handle_request_retries(
 
         if self._should_retry_request(context, error):
             request.retry_count += 1
+            self.log.warning(
+                f'Retrying request to {context.request.url} due to: {error} \n'
+                f'{get_one_line_error_summary_if_possible(error)}'
+            )
             await self._statistics.error_tracker.add(error=error, context=context)
 
             if self._error_handler:
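
The warning above leans on `get_one_line_error_summary_if_possible`, added in `_logging_utils.py` further down. A rough sketch of what it yields for an ordinary, non-timeout error — the failing call is arbitrary, the private module path is taken from this diff's location, and the exact output depends on the Python version:

    from crawlee.crawlers._basic._logging_utils import get_one_line_error_summary_if_possible

    try:
        int('not-a-number')  # any failing call stands in for a request handler error
    except ValueError as exc:
        # Prints roughly: File "<path>", line N, in <module>,     int('not-a-number')
        print(get_one_line_error_summary_if_possible(exc))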
@@ -974,7 +984,10 @@ async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
             context.session.mark_bad()
 
     async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
-        self._logger.exception('Request failed and reached maximum retries', exc_info=error)
+        self._logger.error(
+            f'Request to {context.request.url} failed and reached maximum retries\n '
+            f'{self._get_message_from_error(error)}'
+        )
         await self._statistics.error_tracker.add(error=error, context=context)
 
         if self._failed_request_handler:
@@ -983,6 +996,32 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
             except Exception as e:
                 raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e
 
+    def _get_message_from_error(self, error: Exception) -> str:
+        """Get error message summary from exception.
+
+        Custom processing to reduce the irrelevant traceback clutter in some cases.
+        """
+        traceback_parts = traceback.format_exception(type(error), value=error, tb=error.__traceback__, chain=True)
+        used_traceback_parts = traceback_parts
+
+        if (
+            isinstance(error, asyncio.exceptions.TimeoutError)
+            and self._request_handler_timeout_text in traceback_parts[-1]
+        ):
+            used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
+            used_traceback_parts.append(traceback_parts[-1])
+
+        return ''.join(used_traceback_parts).strip('\n')
+
+    def _get_only_inner_most_exception(self, error: BaseException) -> BaseException:
+        """Get innermost exception by following __cause__ and __context__ attributes of exception."""
+        if error.__cause__:
+            return self._get_only_inner_most_exception(error.__cause__)
+        if error.__context__:
+            return self._get_only_inner_most_exception(error.__context__)
+        # No __cause__ and no __context__, this is as deep as it can get.
+        return error
+
     def _prepare_send_request_function(
         self,
         session: Session | None,
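
For context (not part of the diff), the chaining that `_get_only_inner_most_exception` walks: `__cause__` is set by an explicit `raise ... from ...`, while `__context__` is set implicitly when an exception is raised while handling another one.

    try:
        try:
            raise ValueError('root cause')
        except ValueError as inner:
            raise RuntimeError('wrapper') from inner  # sets __cause__
    except RuntimeError as outer:
        chained = outer

    # Recursing through __cause__/__context__ bottoms out at the original
    # ValueError, whose frames are the ones worth logging.
    assert str(chained.__cause__) == 'root cause'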
@@ -1252,7 +1291,8 @@ async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
         await wait_for(
             lambda: self._context_pipeline(context, self.router),
             timeout=self._request_handler_timeout,
-            timeout_message=f'Request handler timed out after {self._request_handler_timeout.total_seconds()} seconds',
+            timeout_message=f'{self._request_handler_timeout_text}'
+            f' {self._request_handler_timeout.total_seconds()} seconds',
             logger=self._logger,
         )

src/crawlee/crawlers/_basic/_logging_utils.py
+64

@@ -0,0 +1,64 @@
+import asyncio
+import re
+import traceback
+
+
+def _get_only_innermost_exception(error: BaseException) -> BaseException:
+    """Get innermost exception by following __cause__ and __context__ attributes of exception."""
+    if error.__cause__:
+        return _get_only_innermost_exception(error.__cause__)
+    if error.__context__:
+        return _get_only_innermost_exception(error.__context__)
+    # No __cause__ and no __context__, this is as deep as it can get.
+    return error
+
+
+def _get_filtered_traceback_parts_for_asyncio_timeout_error(traceback_parts: list[str]) -> list[str]:
+    """Extract only the most relevant traceback parts from stack trace."""
+    ignore_pattern = (
+        r'([\\/]{1}asyncio[\\/]{1})|'  # internal asyncio parts
+        r'(Traceback \(most recent call last\))|'  # common part of the stack trace formatting
+        r'(asyncio\.exceptions\.CancelledError)'  # internal asyncio exception
+    )
+    return [
+        _strip_pep657_highlighting(traceback_part)
+        for traceback_part in traceback_parts
+        if not re.findall(ignore_pattern, traceback_part)
+    ]
+
+
+def _strip_pep657_highlighting(traceback_part: str) -> str:
+    """Remove PEP 657 highlighting from the traceback."""
+    highlight_pattern = r'(\n\s*~*\^+~*\n)$'
+    return re.sub(highlight_pattern, '\n', traceback_part)
+
+
+def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
+    timeout_error: asyncio.exceptions.TimeoutError,
+) -> list[str]:
+    innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
+    return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
+
+
+def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
+    innermost_error = _get_only_innermost_exception(error)
+    return traceback.format_exception(
+        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=True
+    )
+
+
+def get_one_line_error_summary_if_possible(error: Exception) -> str:
+    if isinstance(error, asyncio.exceptions.TimeoutError):
+        most_relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+    else:
+        traceback_parts = _get_traceback_parts_for_innermost_exception(error)
+        # Commonly last traceback part is type of the error, and the second last part is the relevant file.
+        # If there are not enough traceback parts, then we are not sure how to summarize the error.
+        relevant_traceback_part_index_from_end = 2
+        most_relevant_part = _strip_pep657_highlighting(
+            _get_traceback_parts_for_innermost_exception(error)[-relevant_traceback_part_index_from_end]
+            if len(traceback_parts) >= relevant_traceback_part_index_from_end
+            else ''
+        )
+
+    return most_relevant_part.strip('\n ').replace('\n', ', ')
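
To see `_strip_pep657_highlighting` in isolation, a small sketch (not part of the commit) applying the same regex to a hand-written frame — the file name and code line are made up:

    import re

    highlight_pattern = r'(\n\s*~*\^+~*\n)$'
    frame = (
        '  File "handler.py", line 3, in handler\n'
        '    await asyncio.sleep(10)\n'
        '          ~~~~~~~~~~~~~^^^^\n'
    )
    # The trailing caret/tilde marker line added by Python 3.11+ (PEP 657) is
    # dropped; the 'File ...' and source lines are kept.
    print(re.sub(highlight_pattern, '\n', frame))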

tests/unit/crawlers/_basic/test_basic_crawler.py
+22
@@ -1323,3 +1323,25 @@ async def test_lock_with_get_robots_txt_file_for_url(server_url: URL) -> None:
 
     # Check that the lock was acquired only once
     assert spy.call_count == 1
+
+
+async def test_reduced_logs_from_timed_out_request_handler(
+    monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture
+) -> None:
+    caplog.set_level(logging.INFO)
+    crawler = BasicCrawler(configure_logging=False, request_handler_timeout=timedelta(seconds=1))
+
+    @crawler.router.default_handler
+    async def handler(context: BasicCrawlingContext) -> None:
+        await asyncio.sleep(10)  # INJECTED DELAY
+
+    await crawler.run([Request.from_url('http://a.com/')])
+
+    for record in caplog.records:
+        if record.funcName == '_handle_failed_request':
+            full_message = (record.message or '') + (record.exc_text or '')
+            assert Counter(full_message)['\n'] < 10
+            assert '# INJECTED DELAY' in full_message
+            break
+    else:
+        raise AssertionError('Expected log message about request handler error was not found.')
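
A note on the assertions: the newline count serves as a proxy for traceback length — fewer than 10 lines means the chained asyncio internals were filtered out — while the `# INJECTED DELAY` check confirms that the surviving frame is the user handler's own source line.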
