Commit c80812c

ci: Use a custom httpbin instance in tests (#167)
- closes #160
1 parent 07c138e commit c80812c

File tree

13 files changed: +137 -91 lines changed


.github/workflows/run_code_checks.yaml (+2)

@@ -18,6 +18,8 @@ jobs:
   unit_tests:
     name: Unit tests
     uses: apify/workflows/.github/workflows/python_unit_tests.yaml@main
+    secrets:
+      httpbin_url: https://janbuchar--httpbin.apify.actor?token=${{ secrets.APIFY_HTTPBIN_TOKEN }}
 
   docs_check:
     name: Docs check
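
The reusable workflow referenced by `uses:` has to declare this secret and hand it to the test run; judging by the conftest change below, it ends up as the `HTTPBIN_URL` environment variable. A hypothetical sketch of the receiving side, assuming a structure the actual apify/workflows definition may not share:

# Hypothetical excerpt of a python_unit_tests.yaml reusable workflow -- illustrative only.
on:
  workflow_call:
    secrets:
      httpbin_url:
        required: false

jobs:
  unit_tests:
    runs-on: ubuntu-latest
    env:
      # Assumed mapping: the caller's secret becomes the env var read by tests/unit/conftest.py.
      HTTPBIN_URL: ${{ secrets.httpbin_url }}
    steps:
      - uses: actions/checkout@v4
      - run: pytest tests/unit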

src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py (+5 -1)

@@ -147,7 +147,11 @@ async def _handle_blocked_request(
         if self._retry_on_blocked:
             status_code = context.http_response.status_code
 
-            if context.session and context.session.is_blocked_status_code(status_code=status_code):
+            if (
+                context.session
+                and status_code not in self._http_client._ignore_http_error_status_codes  # noqa: SLF001
+                and context.session.is_blocked_status_code(status_code=status_code)
+            ):
                 raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
 
         matched_selectors = [

src/crawlee/http_crawler/_http_crawler.py (+5 -1)

@@ -127,7 +127,11 @@ async def _handle_blocked_request(self, context: HttpCrawlingContext) -> AsyncGe
         if self._retry_on_blocked:
             status_code = context.http_response.status_code
 
-            if context.session and context.session.is_blocked_status_code(status_code=status_code):
+            if (
+                context.session
+                and status_code not in self._http_client._ignore_http_error_status_codes  # noqa: SLF001
+                and context.session.is_blocked_status_code(status_code=status_code)
+            ):
                 raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
 
         yield context

src/crawlee/parsel_crawler/_parsel_crawler.py (+5 -1)

@@ -140,7 +140,11 @@ async def _handle_blocked_request(
         if self._retry_on_blocked:
             status_code = context.http_response.status_code
 
-            if context.session and context.session.is_blocked_status_code(status_code=status_code):
+            if (
+                context.session
+                and status_code not in self._http_client._ignore_http_error_status_codes  # noqa: SLF001
+                and context.session.is_blocked_status_code(status_code=status_code)
+            ):
                 raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
 
         parsel = context.selector
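
All three HTTP-based crawlers (BeautifulSoup, HTTP, Parsel) gain the same guard: a status code the user explicitly listed in the HTTP client's `ignore_http_error_status_codes` is no longer treated as a sign of a blocked session. A minimal sketch of the resulting check order, with simplified hypothetical names (the real crawlers read the set off the private `_http_client` attribute, hence the `noqa: SLF001`):

# Illustrative sketch of the new decision logic -- not the crawlee source.
BLOCKED_STATUS_CODES = {401, 403, 429}  # stand-in for Session.is_blocked_status_code()


def should_raise_session_error(status_code: int, ignored: set[int]) -> bool:
    # Codes the user asked the HTTP client to ignore are exempt,
    # even when they would otherwise look like a block.
    if status_code in ignored:
        return False
    return status_code in BLOCKED_STATUS_CODES


assert should_raise_session_error(403, ignored=set())      # crawler raises SessionError
assert not should_raise_session_error(403, ignored={403})  # user opted out of treating 403 as blocking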

tests/unit/basic_crawler/test_basic_crawler.py (+24 -12)

@@ -13,6 +13,7 @@
 
 import httpx
 import pytest
+from httpx import URL
 
 from crawlee import ConcurrencySettings, EnqueueStrategy, Glob
 from crawlee._request import BaseRequestData, Request

@@ -526,22 +527,27 @@ async def test_crawler_get_storages() -> None:
     assert isinstance(kvs, KeyValueStore)
 
 
-async def test_crawler_run_requests(httpbin: str) -> None:
+async def test_crawler_run_requests(httpbin: URL) -> None:
     crawler = BasicCrawler()
     seen_urls = list[str]()
 
     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
         seen_urls.append(context.request.url)
 
-    stats = await crawler.run([f'{httpbin}/1', f'{httpbin}/2', f'{httpbin}/3'])
+    start_urls = [
+        str(httpbin.copy_with(path='/1')),
+        str(httpbin.copy_with(path='/2')),
+        str(httpbin.copy_with(path='/3')),
+    ]
+    stats = await crawler.run(start_urls)
 
-    assert seen_urls == [f'{httpbin}/1', f'{httpbin}/2', f'{httpbin}/3']
+    assert seen_urls == start_urls
     assert stats.requests_total == 3
     assert stats.requests_finished == 3
 
 
-async def test_context_push_and_get_data(httpbin: str) -> None:
+async def test_context_push_and_get_data(httpbin: URL) -> None:
     crawler = BasicCrawler()
     dataset = await Dataset.open()
 

@@ -555,7 +561,7 @@ async def handler(context: BasicCrawlingContext) -> None:
     await dataset.push_data('{"c": 3}')
     assert (await crawler.get_data()).items == [{'a': 1}, {'c': 3}]
 
-    stats = await crawler.run([f'{httpbin}/1'])
+    stats = await crawler.run([str(httpbin.copy_with(path='/1'))])
 
     assert (await crawler.get_data()).items == [{'a': 1}, {'c': 3}, {'b': 2}]
     assert stats.requests_total == 1

@@ -596,15 +602,15 @@ async def test_crawler_push_and_export_data(tmp_path: Path) -> None:
     assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
 
 
-async def test_context_push_and_export_data(httpbin: str, tmp_path: Path) -> None:
+async def test_context_push_and_export_data(httpbin: URL, tmp_path: Path) -> None:
     crawler = BasicCrawler()
 
     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
         await context.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])
         await context.push_data({'id': 2, 'test': 'test'})
 
-    await crawler.run([f'{httpbin}/1'])
+    await crawler.run([str(httpbin.copy_with(path='/1'))])
 
     await crawler.export_data_json(path=tmp_path / 'dataset.json')
     await crawler.export_data_csv(path=tmp_path / 'dataset.csv')

@@ -618,15 +624,15 @@ async def handler(context: BasicCrawlingContext) -> None:
     assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
 
 
-async def test_crawler_push_and_export_data_and_json_dump_parameter(httpbin: str, tmp_path: Path) -> None:
+async def test_crawler_push_and_export_data_and_json_dump_parameter(httpbin: URL, tmp_path: Path) -> None:
     crawler = BasicCrawler()
 
     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
         await context.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])
         await context.push_data({'id': 2, 'test': 'test'})
 
-    await crawler.run([f'{httpbin}/1'])
+    await crawler.run([str(httpbin.copy_with(path='/1'))])
 
     await crawler.export_data_json(path=tmp_path / 'dataset.json', indent=3)
 

@@ -671,8 +677,14 @@ async def handler(context: BasicCrawlingContext) -> None:
     assert (await store.get_value('foo')) == 'bar'
 
 
-async def test_max_requests_per_crawl(httpbin: str) -> None:
-    start_urls = [f'{httpbin}/1', f'{httpbin}/2', f'{httpbin}/3', f'{httpbin}/4', f'{httpbin}/5']
+async def test_max_requests_per_crawl(httpbin: URL) -> None:
+    start_urls = [
+        str(httpbin.copy_with(path='/1')),
+        str(httpbin.copy_with(path='/2')),
+        str(httpbin.copy_with(path='/3')),
+        str(httpbin.copy_with(path='/4')),
+        str(httpbin.copy_with(path='/5')),
+    ]
     processed_urls = []
 
     # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately

@@ -693,7 +705,7 @@ async def handler(context: BasicCrawlingContext) -> None:
     assert stats.requests_finished == 3
 
 
-async def test_max_crawl_depth(httpbin: str) -> None:
+async def test_max_crawl_depth(httpbin: URL) -> None:
     processed_urls = []
 
     start_request = Request.from_url('https://someplace.com/', label='start')

tests/unit/browsers/test_browser_pool.py (+28 -23)

@@ -1,53 +1,58 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 import pytest
 
 from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin
 
+if TYPE_CHECKING:
+    from httpx import URL
+
 
-async def test_default_plugin_new_page_creation(httpbin: str) -> None:
+async def test_default_plugin_new_page_creation(httpbin: URL) -> None:
     async with BrowserPool() as browser_pool:
         page_1 = await browser_pool.new_page()
-        await page_1.page.goto(f'{httpbin}/get')
+        await page_1.page.goto(str(httpbin.copy_with(path='/get')))
         assert page_1.browser_type == 'chromium'
-        assert page_1.page.url == f'{httpbin}/get'
+        assert page_1.page.url == str(httpbin.copy_with(path='/get'))
         assert '<html' in await page_1.page.content()  # there is some HTML content
         assert browser_pool.total_pages_count == 1
 
         page_2 = await browser_pool.new_page()
-        await page_2.page.goto(f'{httpbin}/status/200')
+        await page_2.page.goto(str(httpbin.copy_with(path='/status/200')))
         assert page_2.browser_type == 'chromium'
-        assert page_2.page.url == f'{httpbin}/status/200'
+        assert page_2.page.url == str(httpbin.copy_with(path='/status/200'))
         assert '<html' in await page_1.page.content()  # there is some HTML content
         assert browser_pool.total_pages_count == 2
 
         await page_1.page.close()
         await page_2.page.close()
 
 
-async def test_multiple_plugins_new_page_creation(httpbin: str) -> None:
+async def test_multiple_plugins_new_page_creation(httpbin: URL) -> None:
     plugin_chromium = PlaywrightBrowserPlugin(browser_type='chromium')
     plugin_firefox = PlaywrightBrowserPlugin(browser_type='firefox')
 
     async with BrowserPool([plugin_chromium, plugin_firefox]) as browser_pool:
         assert browser_pool.plugins == [plugin_chromium, plugin_firefox]
 
         page_1 = await browser_pool.new_page()
-        await page_1.page.goto(f'{httpbin}/get')
+        await page_1.page.goto(str(httpbin.copy_with(path='/get')))
         assert page_1.browser_type == 'chromium'
-        assert page_1.page.url == f'{httpbin}/get'
+        assert page_1.page.url == str(httpbin.copy_with(path='/get'))
         assert '<html' in await page_1.page.content()  # there is some HTML content
 
         page_2 = await browser_pool.new_page()
-        await page_2.page.goto(f'{httpbin}/headers')
+        await page_2.page.goto(str(httpbin.copy_with(path='/headers')))
         assert page_2.browser_type == 'firefox'
-        assert page_2.page.url == f'{httpbin}/headers'
+        assert page_2.page.url == str(httpbin.copy_with(path='/headers'))
         assert '<html' in await page_2.page.content()  # there is some HTML content
 
         page_3 = await browser_pool.new_page()
-        await page_3.page.goto(f'{httpbin}/user-agent')
+        await page_3.page.goto(str(httpbin.copy_with(path='/user-agent')))
         assert page_3.browser_type == 'chromium'
-        assert page_3.page.url == f'{httpbin}/user-agent'
+        assert page_3.page.url == str(httpbin.copy_with(path='/user-agent'))
         assert '<html' in await page_3.page.content()  # there is some HTML content
 
         await page_1.page.close()

@@ -57,7 +62,7 @@ async def test_multiple_plugins_new_page_creation(httpbin: str) -> None:
         assert browser_pool.total_pages_count == 3
 
 
-async def test_new_page_with_each_plugin(httpbin: str) -> None:
+async def test_new_page_with_each_plugin(httpbin: URL) -> None:
     plugin_chromium = PlaywrightBrowserPlugin(browser_type='chromium')
     plugin_firefox = PlaywrightBrowserPlugin(browser_type='firefox')
 

@@ -69,12 +74,12 @@ async def test_new_page_with_each_plugin(httpbin: str) -> None:
         assert pages[0].browser_type == 'chromium'
         assert pages[1].browser_type == 'firefox'
 
-        await pages[0].page.goto(f'{httpbin}/get')
-        assert pages[0].page.url == f'{httpbin}/get'
+        await pages[0].page.goto(str(httpbin.copy_with(path='/get')))
+        assert pages[0].page.url == str(httpbin.copy_with(path='/get'))
         assert '<html' in await pages[0].page.content()  # there is some HTML content
 
-        await pages[1].page.goto(f'{httpbin}/headers')
-        assert pages[1].page.url == f'{httpbin}/headers'
+        await pages[1].page.goto(str(httpbin.copy_with(path='/headers')))
+        assert pages[1].page.url == str(httpbin.copy_with(path='/headers'))
         assert '<html' in await pages[1].page.content()
 
         for page in pages:

@@ -83,16 +88,16 @@ async def test_new_page_with_each_plugin(httpbin: str) -> None:
         assert browser_pool.total_pages_count == 2
 
 
-async def test_with_default_plugin_constructor(httpbin: str) -> None:
+async def test_with_default_plugin_constructor(httpbin: URL) -> None:
     async with BrowserPool.with_default_plugin(headless=True, browser_type='firefox') as browser_pool:
         assert len(browser_pool.plugins) == 1
         assert isinstance(browser_pool.plugins[0], PlaywrightBrowserPlugin)
 
         page = await browser_pool.new_page()
         assert page.browser_type == 'firefox'
 
-        await page.page.goto(f'{httpbin}/get')
-        assert page.page.url == f'{httpbin}/get'
+        await page.page.goto(str(httpbin.copy_with(path='/get')))
+        assert page.page.url == str(httpbin.copy_with(path='/get'))
         assert '<html' in await page.page.content()  # there is some HTML content
 
         await page.page.close()

@@ -114,13 +119,13 @@ async def test_new_page_with_invalid_plugin() -> None:
         await browser_pool.new_page(browser_plugin=plugin_2)
 
 
-async def test_resource_management(httpbin: str) -> None:
+async def test_resource_management(httpbin: URL) -> None:
     playwright_plugin = PlaywrightBrowserPlugin(browser_type='chromium')
 
     async with BrowserPool([playwright_plugin]) as browser_pool:
         page = await browser_pool.new_page()
-        await page.page.goto(f'{httpbin}/get')
-        assert page.page.url == f'{httpbin}/get'
+        await page.page.goto(str(httpbin.copy_with(path='/get')))
+        assert page.page.url == str(httpbin.copy_with(path='/get'))
         assert '<html' in await page.page.content()  # there is some HTML content
         assert browser_pool.total_pages_count == 1
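
Note the `TYPE_CHECKING` guard for the `URL` import: since the module starts with `from __future__ import annotations`, annotations are evaluated lazily and `httpx` stays out of the runtime import graph. A small sketch of the pattern (the function name is illustrative):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from httpx import URL  # needed only by the type checker


def hostname(url: URL) -> str:
    # At runtime the annotation is just a string, so this module never imports httpx itself.
    return url.host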

tests/unit/browsers/test_playwright_browser_controller.py (+5 -2)

@@ -2,13 +2,16 @@
 
 import asyncio
 from datetime import datetime, timedelta, timezone
-from typing import AsyncGenerator
+from typing import TYPE_CHECKING, AsyncGenerator
 
 import pytest
 from playwright.async_api import Browser, Playwright, async_playwright
 
 from crawlee.browsers import PlaywrightBrowserController
 
+if TYPE_CHECKING:
+    from httpx import URL
+
 
 @pytest.fixture
 async def playwright() -> AsyncGenerator[Playwright, None]:

@@ -41,7 +44,7 @@ async def test_initial_state(browser: Browser) -> None:
     assert controller.has_free_capacity
 
 
-async def test_open_and_close_page(controller: PlaywrightBrowserController, httpbin: str) -> None:
+async def test_open_and_close_page(controller: PlaywrightBrowserController, httpbin: URL) -> None:
     page = await controller.new_page()
     await page.goto(f'{httpbin}')

tests/unit/browsers/test_playwright_browser_plugin.py (+5 -2)

@@ -1,11 +1,14 @@
 from __future__ import annotations
 
-from typing import AsyncGenerator
+from typing import TYPE_CHECKING, AsyncGenerator
 
 import pytest
 
 from crawlee.browsers import PlaywrightBrowserPlugin
 
+if TYPE_CHECKING:
+    from httpx import URL
+
 
 @pytest.fixture
 async def plugin() -> AsyncGenerator[PlaywrightBrowserPlugin, None]:

@@ -28,7 +31,7 @@ async def test_initial_state() -> None:
     assert plugin.max_open_pages_per_browser == 10
 
 
-async def test_new_browser(plugin: PlaywrightBrowserPlugin, httpbin: str) -> None:
+async def test_new_browser(plugin: PlaywrightBrowserPlugin, httpbin: URL) -> None:
     browser_controller = await plugin.new_browser()
 
     assert browser_controller.is_browser_connected

tests/unit/conftest.py (+3 -2)

@@ -7,6 +7,7 @@
 from typing import TYPE_CHECKING, Callable, cast
 
 import pytest
+from httpx import URL
 from proxy import Proxy
 
 from crawlee import service_container

@@ -68,8 +69,8 @@ def memory_storage_client(tmp_path: Path) -> MemoryStorageClient:
 
 
 @pytest.fixture
-def httpbin() -> str:
-    return os.environ.get('HTTPBIN_URL', 'https://httpbin.org')
+def httpbin() -> URL:
+    return URL(os.environ.get('HTTPBIN_URL', 'https://httpbin.org'))
 
 
 @pytest.fixture
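
Returning an `httpx.URL` instead of a plain string is what makes the `copy_with` calls in the tests above safe: the custom httpbin instance carries a `?token=...` query string, and the old `f'{httpbin}/get'` pattern would have glued the path onto the end of the query. A quick illustration with a made-up token:

from httpx import URL

base = URL('https://janbuchar--httpbin.apify.actor?token=example')

# Old pattern -- naive concatenation lands the path inside the query string:
print(f'{base}/get')
# -> https://janbuchar--httpbin.apify.actor?token=example/get  (broken)

# New pattern -- copy_with() replaces only the path and keeps the token intact:
print(base.copy_with(path='/get'))
# -> https://janbuchar--httpbin.apify.actor/get?token=example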
