7
7
import sys
8
8
import tempfile
9
9
import threading
10
+ import traceback
10
11
from asyncio import CancelledError
11
12
from collections .abc import AsyncGenerator , Awaitable , Iterable , Sequence
12
13
from contextlib import AsyncExitStack , suppress
55
56
from crawlee .storages import Dataset , KeyValueStore , RequestQueue
56
57
57
58
from ._context_pipeline import ContextPipeline
59
+ from ._logging_utils import (
60
+ get_one_line_error_summary_if_possible ,
61
+ reduce_asyncio_timeout_error_to_relevant_traceback_parts ,
62
+ )
58
63
59
64
if TYPE_CHECKING :
60
65
import re
@@ -218,6 +223,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
218
223
"""
219
224
220
225
_CRAWLEE_STATE_KEY = 'CRAWLEE_STATE'
226
+ _request_handler_timeout_text = 'Request handler timed out after'
221
227
222
228
def __init__ (
223
229
self ,
@@ -921,6 +927,10 @@ async def _handle_request_retries(
921
927
922
928
if self ._should_retry_request (context , error ):
923
929
request .retry_count += 1
930
+ self .log .warning (
931
+ f'Retrying request to { context .request .url } due to: { error } \n '
932
+ f'{ get_one_line_error_summary_if_possible (error )} '
933
+ )
924
934
await self ._statistics .error_tracker .add (error = error , context = context )
925
935
926
936
if self ._error_handler :
@@ -974,7 +984,10 @@ async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingC
974
984
context .session .mark_bad ()
975
985
976
986
async def _handle_failed_request (self , context : TCrawlingContext | BasicCrawlingContext , error : Exception ) -> None :
977
- self ._logger .exception ('Request failed and reached maximum retries' , exc_info = error )
987
+ self ._logger .error (
988
+ f'Request to { context .request .url } failed and reached maximum retries\n '
989
+ f'{ self ._get_message_from_error (error )} '
990
+ )
978
991
await self ._statistics .error_tracker .add (error = error , context = context )
979
992
980
993
if self ._failed_request_handler :
@@ -983,6 +996,32 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling
983
996
except Exception as e :
984
997
raise UserDefinedErrorHandlerError ('Exception thrown in user-defined failed request handler' ) from e
985
998
999
+ def _get_message_from_error (self , error : Exception ) -> str :
1000
+ """Get error message summary from exception.
1001
+
1002
+ Custom processing to reduce the irrelevant traceback clutter in some cases.
1003
+ """
1004
+ traceback_parts = traceback .format_exception (type (error ), value = error , tb = error .__traceback__ , chain = True )
1005
+ used_traceback_parts = traceback_parts
1006
+
1007
+ if (
1008
+ isinstance (error , asyncio .exceptions .TimeoutError )
1009
+ and self ._request_handler_timeout_text in traceback_parts [- 1 ]
1010
+ ):
1011
+ used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts (error )
1012
+ used_traceback_parts .append (traceback_parts [- 1 ])
1013
+
1014
+ return '' .join (used_traceback_parts ).strip ('\n ' )
1015
+
1016
+ def _get_only_inner_most_exception (self , error : BaseException ) -> BaseException :
1017
+ """Get innermost exception by following __cause__ and __context__ attributes of exception."""
1018
+ if error .__cause__ :
1019
+ return self ._get_only_inner_most_exception (error .__cause__ )
1020
+ if error .__context__ :
1021
+ return self ._get_only_inner_most_exception (error .__context__ )
1022
+ # No __cause__ and no __context__, this is as deep as it can get.
1023
+ return error
1024
+
986
1025
def _prepare_send_request_function (
987
1026
self ,
988
1027
session : Session | None ,
@@ -1252,7 +1291,8 @@ async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
1252
1291
await wait_for (
1253
1292
lambda : self ._context_pipeline (context , self .router ),
1254
1293
timeout = self ._request_handler_timeout ,
1255
- timeout_message = f'Request handler timed out after { self ._request_handler_timeout .total_seconds ()} seconds' ,
1294
+ timeout_message = f'{ self ._request_handler_timeout_text } '
1295
+ f' { self ._request_handler_timeout .total_seconds ()} seconds' ,
1256
1296
logger = self ._logger ,
1257
1297
)
1258
1298
0 commit comments