Commit c80812c

ci: Use a custom httpbin instance in tests (#167)
- closes #160
1 parent 07c138e commit c80812c

File tree

13 files changed: +137 -91 lines changed


.github/workflows/run_code_checks.yaml (+2)

@@ -18,6 +18,8 @@ jobs:
   unit_tests:
     name: Unit tests
     uses: apify/workflows/.github/workflows/python_unit_tests.yaml@main
+    secrets:
+      httpbin_url: https://janbuchar--httpbin.apify.actor?token=${{ secrets.APIFY_HTTPBIN_TOKEN }}
 
   docs_check:
     name: Docs check
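
The reusable workflow referenced by `uses:` has to declare this secret and hand it to the test run; judging by the conftest change below, it ends up as the `HTTPBIN_URL` environment variable. A hypothetical sketch of the receiving side, assuming a structure the actual apify/workflows definition may not share:

# Hypothetical excerpt of a python_unit_tests.yaml reusable workflow -- illustrative only.
on:
  workflow_call:
    secrets:
      httpbin_url:
        required: false

jobs:
  unit_tests:
    runs-on: ubuntu-latest
    env:
      # Assumed mapping: the caller's secret becomes the env var read by tests/unit/conftest.py.
      HTTPBIN_URL: ${{ secrets.httpbin_url }}
    steps:
      - uses: actions/checkout@v4
      - run: pytest tests/unit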

src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py (+5 -1)

@@ -147,7 +147,11 @@ async def _handle_blocked_request(
         if self._retry_on_blocked:
             status_code = context.http_response.status_code
 
-            if context.session and context.session.is_blocked_status_code(status_code=status_code):
+            if (
+                context.session
+                and status_code not in self._http_client._ignore_http_error_status_codes  # noqa: SLF001
+                and context.session.is_blocked_status_code(status_code=status_code)
+            ):
                 raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
 
         matched_selectors = [

src/crawlee/http_crawler/_http_crawler.py (+5 -1)

@@ -127,7 +127,11 @@ async def _handle_blocked_request(self, context: HttpCrawlingContext) -> AsyncGe
         if self._retry_on_blocked:
             status_code = context.http_response.status_code
 
-            if context.session and context.session.is_blocked_status_code(status_code=status_code):
+            if (
+                context.session
+                and status_code not in self._http_client._ignore_http_error_status_codes  # noqa: SLF001
+                and context.session.is_blocked_status_code(status_code=status_code)
+            ):
                 raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
 
         yield context

src/crawlee/parsel_crawler/_parsel_crawler.py (+5 -1)

@@ -140,7 +140,11 @@ async def _handle_blocked_request(
         if self._retry_on_blocked:
             status_code = context.http_response.status_code
 
-            if context.session and context.session.is_blocked_status_code(status_code=status_code):
+            if (
+                context.session
+                and status_code not in self._http_client._ignore_http_error_status_codes  # noqa: SLF001
+                and context.session.is_blocked_status_code(status_code=status_code)
+            ):
                 raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
 
         parsel = context.selector
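
All three HTTP-based crawlers (BeautifulSoup, HTTP, Parsel) gain the same guard: a status code the user explicitly listed in the HTTP client's `ignore_http_error_status_codes` is no longer treated as a sign of a blocked session. A minimal sketch of the resulting check order, with simplified hypothetical names (the real crawlers read the set off the private `_http_client` attribute, hence the `noqa: SLF001`):

# Illustrative sketch of the new decision logic -- not the crawlee source.
BLOCKED_STATUS_CODES = {401, 403, 429}  # stand-in for Session.is_blocked_status_code()


def should_raise_session_error(status_code: int, ignored: set[int]) -> bool:
    # Codes the user asked the HTTP client to ignore are exempt,
    # even when they would otherwise look like a block.
    if status_code in ignored:
        return False
    return status_code in BLOCKED_STATUS_CODES


assert should_raise_session_error(403, ignored=set())      # crawler raises SessionError
assert not should_raise_session_error(403, ignored={403})  # user opted out of treating 403 as blocking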

tests/unit/basic_crawler/test_basic_crawler.py (+24 -12)

@@ -13,6 +13,7 @@
 
 import httpx
 import pytest
+from httpx import URL
 
 from crawlee import ConcurrencySettings, EnqueueStrategy, Glob
 from crawlee._request import BaseRequestData, Request

@@ -526,22 +527,27 @@ async def test_crawler_get_storages() -> None:
     assert isinstance(kvs, KeyValueStore)
 
 
-async def test_crawler_run_requests(httpbin: str) -> None:
+async def test_crawler_run_requests(httpbin: URL) -> None:
     crawler = BasicCrawler()
     seen_urls = list[str]()
 
     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
         seen_urls.append(context.request.url)
 
-    stats = await crawler.run([f'{httpbin}/1', f'{httpbin}/2', f'{httpbin}/3'])
+    start_urls = [
+        str(httpbin.copy_with(path='/1')),
+        str(httpbin.copy_with(path='/2')),
+        str(httpbin.copy_with(path='/3')),
+    ]
+    stats = await crawler.run(start_urls)
 
-    assert seen_urls == [f'{httpbin}/1', f'{httpbin}/2', f'{httpbin}/3']
+    assert seen_urls == start_urls
     assert stats.requests_total == 3
     assert stats.requests_finished == 3
 
 
-async def test_context_push_and_get_data(httpbin: str) -> None:
+async def test_context_push_and_get_data(httpbin: URL) -> None:
     crawler = BasicCrawler()
     dataset = await Dataset.open()
 

@@ -555,7 +561,7 @@ async def handler(context: BasicCrawlingContext) -> None:
     await dataset.push_data('{"c": 3}')
     assert (await crawler.get_data()).items == [{'a': 1}, {'c': 3}]
 
-    stats = await crawler.run([f'{httpbin}/1'])
+    stats = await crawler.run([str(httpbin.copy_with(path='/1'))])
 
     assert (await crawler.get_data()).items == [{'a': 1}, {'c': 3}, {'b': 2}]
     assert stats.requests_total == 1

@@ -596,15 +602,15 @@ async def test_crawler_push_and_export_data(tmp_path: Path) -> None:
     assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
 
 
-async def test_context_push_and_export_data(httpbin: str, tmp_path: Path) -> None:
+async def test_context_push_and_export_data(httpbin: URL, tmp_path: Path) -> None:
     crawler = BasicCrawler()
 
     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
         await context.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])
         await context.push_data({'id': 2, 'test': 'test'})
 
-    await crawler.run([f'{httpbin}/1'])
+    await crawler.run([str(httpbin.copy_with(path='/1'))])
 
     await crawler.export_data_json(path=tmp_path / 'dataset.json')
     await crawler.export_data_csv(path=tmp_path / 'dataset.csv')

@@ -618,15 +624,15 @@ async def handler(context: BasicCrawlingContext) -> None:
     assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
 
 
-async def test_crawler_push_and_export_data_and_json_dump_parameter(httpbin: str, tmp_path: Path) -> None:
+async def test_crawler_push_and_export_data_and_json_dump_parameter(httpbin: URL, tmp_path: Path) -> None:
     crawler = BasicCrawler()
 
     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
         await context.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])
         await context.push_data({'id': 2, 'test': 'test'})
 
-    await crawler.run([f'{httpbin}/1'])
+    await crawler.run([str(httpbin.copy_with(path='/1'))])
 
     await crawler.export_data_json(path=tmp_path / 'dataset.json', indent=3)
 

@@ -671,8 +677,14 @@ async def handler(context: BasicCrawlingContext) -> None:
     assert (await store.get_value('foo')) == 'bar'
 
 
-async def test_max_requests_per_crawl(httpbin: str) -> None:
-    start_urls = [f'{httpbin}/1', f'{httpbin}/2', f'{httpbin}/3', f'{httpbin}/4', f'{httpbin}/5']
+async def test_max_requests_per_crawl(httpbin: URL) -> None:
+    start_urls = [
+        str(httpbin.copy_with(path='/1')),
+        str(httpbin.copy_with(path='/2')),
+        str(httpbin.copy_with(path='/3')),
+        str(httpbin.copy_with(path='/4')),
+        str(httpbin.copy_with(path='/5')),
+    ]
     processed_urls = []
 
     # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately

@@ -693,7 +705,7 @@ async def handler(context: BasicCrawlingContext) -> None:
     assert stats.requests_finished == 3
 
 
-async def test_max_crawl_depth(httpbin: str) -> None:
+async def test_max_crawl_depth(httpbin: URL) -> None:
     processed_urls = []
 
     start_request = Request.from_url('https://someplace.com/', label='start')

tests/unit/browsers/test_browser_pool.py (+28 -23)

@@ -1,53 +1,58 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 import pytest
 
 from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin
 
+if TYPE_CHECKING:
+    from httpx import URL
+
 
-async def test_default_plugin_new_page_creation(httpbin: str) -> None:
+async def test_default_plugin_new_page_creation(httpbin: URL) -> None:
     async with BrowserPool() as browser_pool:
         page_1 = await browser_pool.new_page()
-        await page_1.page.goto(f'{httpbin}/get')
+        await page_1.page.goto(str(httpbin.copy_with(path='/get')))
         assert page_1.browser_type == 'chromium'
-        assert page_1.page.url == f'{httpbin}/get'
+        assert page_1.page.url == str(httpbin.copy_with(path='/get'))
         assert '<html' in await page_1.page.content()  # there is some HTML content
         assert browser_pool.total_pages_count == 1
 
         page_2 = await browser_pool.new_page()
-        await page_2.page.goto(f'{httpbin}/status/200')
+        await page_2.page.goto(str(httpbin.copy_with(path='/status/200')))
         assert page_2.browser_type == 'chromium'
-        assert page_2.page.url == f'{httpbin}/status/200'
+        assert page_2.page.url == str(httpbin.copy_with(path='/status/200'))
         assert '<html' in await page_1.page.content()  # there is some HTML content
         assert browser_pool.total_pages_count == 2
 
         await page_1.page.close()
         await page_2.page.close()
 
 
-async def test_multiple_plugins_new_page_creation(httpbin: str) -> None:
+async def test_multiple_plugins_new_page_creation(httpbin: URL) -> None:
     plugin_chromium = PlaywrightBrowserPlugin(browser_type='chromium')
     plugin_firefox = PlaywrightBrowserPlugin(browser_type='firefox')
 
     async with BrowserPool([plugin_chromium, plugin_firefox]) as browser_pool:
         assert browser_pool.plugins == [plugin_chromium, plugin_firefox]
 
         page_1 = await browser_pool.new_page()
-        await page_1.page.goto(f'{httpbin}/get')
+        await page_1.page.goto(str(httpbin.copy_with(path='/get')))
         assert page_1.browser_type == 'chromium'
-        assert page_1.page.url == f'{httpbin}/get'
+        assert page_1.page.url == str(httpbin.copy_with(path='/get'))
         assert '<html' in await page_1.page.content()  # there is some HTML content
 
         page_2 = await browser_pool.new_page()
-        await page_2.page.goto(f'{httpbin}/headers')
+        await page_2.page.goto(str(httpbin.copy_with(path='/headers')))
         assert page_2.browser_type == 'firefox'
-        assert page_2.page.url == f'{httpbin}/headers'
+        assert page_2.page.url == str(httpbin.copy_with(path='/headers'))
         assert '<html' in await page_2.page.content()  # there is some HTML content
 
         page_3 = await browser_pool.new_page()
-        await page_3.page.goto(f'{httpbin}/user-agent')
+        await page_3.page.goto(str(httpbin.copy_with(path='/user-agent')))
         assert page_3.browser_type == 'chromium'
-        assert page_3.page.url == f'{httpbin}/user-agent'
+        assert page_3.page.url == str(httpbin.copy_with(path='/user-agent'))
         assert '<html' in await page_3.page.content()  # there is some HTML content
 
         await page_1.page.close()

@@ -57,7 +62,7 @@ async def test_multiple_plugins_new_page_creation(httpbin: str) -> None:
         assert browser_pool.total_pages_count == 3
 
 
-async def test_new_page_with_each_plugin(httpbin: str) -> None:
+async def test_new_page_with_each_plugin(httpbin: URL) -> None:
     plugin_chromium = PlaywrightBrowserPlugin(browser_type='chromium')
     plugin_firefox = PlaywrightBrowserPlugin(browser_type='firefox')
 

@@ -69,12 +74,12 @@ async def test_new_page_with_each_plugin(httpbin: str) -> None:
         assert pages[0].browser_type == 'chromium'
         assert pages[1].browser_type == 'firefox'
 
-        await pages[0].page.goto(f'{httpbin}/get')
-        assert pages[0].page.url == f'{httpbin}/get'
+        await pages[0].page.goto(str(httpbin.copy_with(path='/get')))
+        assert pages[0].page.url == str(httpbin.copy_with(path='/get'))
         assert '<html' in await pages[0].page.content()  # there is some HTML content
 
-        await pages[1].page.goto(f'{httpbin}/headers')
-        assert pages[1].page.url == f'{httpbin}/headers'
+        await pages[1].page.goto(str(httpbin.copy_with(path='/headers')))
+        assert pages[1].page.url == str(httpbin.copy_with(path='/headers'))
         assert '<html' in await pages[1].page.content()
 
         for page in pages:

@@ -83,16 +88,16 @@ async def test_new_page_with_each_plugin(httpbin: str) -> None:
         assert browser_pool.total_pages_count == 2
 
 
-async def test_with_default_plugin_constructor(httpbin: str) -> None:
+async def test_with_default_plugin_constructor(httpbin: URL) -> None:
     async with BrowserPool.with_default_plugin(headless=True, browser_type='firefox') as browser_pool:
         assert len(browser_pool.plugins) == 1
         assert isinstance(browser_pool.plugins[0], PlaywrightBrowserPlugin)
 
         page = await browser_pool.new_page()
         assert page.browser_type == 'firefox'
 
-        await page.page.goto(f'{httpbin}/get')
-        assert page.page.url == f'{httpbin}/get'
+        await page.page.goto(str(httpbin.copy_with(path='/get')))
+        assert page.page.url == str(httpbin.copy_with(path='/get'))
         assert '<html' in await page.page.content()  # there is some HTML content
 
         await page.page.close()

@@ -114,13 +119,13 @@ async def test_new_page_with_invalid_plugin() -> None:
         await browser_pool.new_page(browser_plugin=plugin_2)
 
 
-async def test_resource_management(httpbin: str) -> None:
+async def test_resource_management(httpbin: URL) -> None:
     playwright_plugin = PlaywrightBrowserPlugin(browser_type='chromium')
 
     async with BrowserPool([playwright_plugin]) as browser_pool:
         page = await browser_pool.new_page()
-        await page.page.goto(f'{httpbin}/get')
-        assert page.page.url == f'{httpbin}/get'
+        await page.page.goto(str(httpbin.copy_with(path='/get')))
+        assert page.page.url == str(httpbin.copy_with(path='/get'))
         assert '<html' in await page.page.content()  # there is some HTML content
         assert browser_pool.total_pages_count == 1
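
Note the `TYPE_CHECKING` guard for the `URL` import: since the module starts with `from __future__ import annotations`, annotations are evaluated lazily and `httpx` stays out of the runtime import graph. A small sketch of the pattern (the function name is illustrative):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from httpx import URL  # needed only by the type checker


def hostname(url: URL) -> str:
    # At runtime the annotation is just a string, so this module never imports httpx itself.
    return url.host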

tests/unit/browsers/test_playwright_browser_controller.py (+5 -2)

@@ -2,13 +2,16 @@
 
 import asyncio
 from datetime import datetime, timedelta, timezone
-from typing import AsyncGenerator
+from typing import TYPE_CHECKING, AsyncGenerator
 
 import pytest
 from playwright.async_api import Browser, Playwright, async_playwright
 
 from crawlee.browsers import PlaywrightBrowserController
 
+if TYPE_CHECKING:
+    from httpx import URL
+
 
 @pytest.fixture
 async def playwright() -> AsyncGenerator[Playwright, None]:

@@ -41,7 +44,7 @@ async def test_initial_state(browser: Browser) -> None:
     assert controller.has_free_capacity
 
 
-async def test_open_and_close_page(controller: PlaywrightBrowserController, httpbin: str) -> None:
+async def test_open_and_close_page(controller: PlaywrightBrowserController, httpbin: URL) -> None:
     page = await controller.new_page()
     await page.goto(f'{httpbin}')

tests/unit/browsers/test_playwright_browser_plugin.py (+5 -2)

@@ -1,11 +1,14 @@
 from __future__ import annotations
 
-from typing import AsyncGenerator
+from typing import TYPE_CHECKING, AsyncGenerator
 
 import pytest
 
 from crawlee.browsers import PlaywrightBrowserPlugin
 
+if TYPE_CHECKING:
+    from httpx import URL
+
 
 @pytest.fixture
 async def plugin() -> AsyncGenerator[PlaywrightBrowserPlugin, None]:

@@ -28,7 +31,7 @@ async def test_initial_state() -> None:
     assert plugin.max_open_pages_per_browser == 10
 
 
-async def test_new_browser(plugin: PlaywrightBrowserPlugin, httpbin: str) -> None:
+async def test_new_browser(plugin: PlaywrightBrowserPlugin, httpbin: URL) -> None:
     browser_controller = await plugin.new_browser()
 
     assert browser_controller.is_browser_connected

tests/unit/conftest.py (+3 -2)

@@ -7,6 +7,7 @@
 from typing import TYPE_CHECKING, Callable, cast
 
 import pytest
+from httpx import URL
 from proxy import Proxy
 
 from crawlee import service_container

@@ -68,8 +69,8 @@ def memory_storage_client(tmp_path: Path) -> MemoryStorageClient:
 
 
 @pytest.fixture
-def httpbin() -> str:
-    return os.environ.get('HTTPBIN_URL', 'https://httpbin.org')
+def httpbin() -> URL:
+    return URL(os.environ.get('HTTPBIN_URL', 'https://httpbin.org'))
 
 
 @pytest.fixture
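
Returning an `httpx.URL` instead of a plain string is what makes the `copy_with` calls in the tests above safe: the custom httpbin instance carries a `?token=...` query string, and the old `f'{httpbin}/get'` pattern would have glued the path onto the end of the query. A quick illustration with a made-up token:

from httpx import URL

base = URL('https://janbuchar--httpbin.apify.actor?token=example')

# Old pattern -- naive concatenation lands the path inside the query string:
print(f'{base}/get')
# -> https://janbuchar--httpbin.apify.actor?token=example/get  (broken)

# New pattern -- copy_with() replaces only the path and keeps the token intact:
print(base.copy_with(path='/get'))
# -> https://janbuchar--httpbin.apify.actor/get?token=example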
