
Commit 7353453

feat: add support for reasoning tool use and upgrade to Qwen3 (#124)
1 parent ff95d2f commit 7353453

File tree: 7 files changed (+52, -21 lines)

- .github/workflows/test.yml
- README.md
- pyproject.toml
- src/raglite/_chatml_function_calling.py
- src/raglite/_config.py
- src/raglite/_litellm.py
- tests/test_chatml_function_calling.py

.github/workflows/test.yml (+2, -1)

@@ -9,7 +9,8 @@ on:
 
 jobs:
   test:
-    runs-on: ubuntu-latest
+    runs-on:
+      group: raglite
 
     strategy:
       fail-fast: false

README.md (+5, -5)

@@ -44,7 +44,7 @@ RAGLite is a Python toolkit for Retrieval-Augmented Generation (RAG) with Postgr
 > 🚀 If you want to use local models, it is recommended to install [an accelerated llama-cpp-python precompiled binary](https://github.com/abetlen/llama-cpp-python?tab=readme-ov-file#supported-backends) with:
 > ```sh
 > # Configure which llama-cpp-python precompiled binary to install (⚠️ not every combination is available):
-> LLAMA_CPP_PYTHON_VERSION=0.3.4
+> LLAMA_CPP_PYTHON_VERSION=0.3.9
 > PYTHON_VERSION=310|311|312
 > ACCELERATOR=metal|cu121|cu122|cu123|cu124
 > PLATFORM=macosx_11_0_arm64|linux_x86_64|win_amd64
@@ -92,7 +92,7 @@ pip install raglite[ragas]
 ### 1. Configuring RAGLite
 
 > [!TIP]
-> 🧠 RAGLite extends [LiteLLM](https://github.com/BerriAI/litellm) with support for [llama.cpp](https://github.com/ggerganov/llama.cpp) models using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python). To select a llama.cpp model (e.g., from [bartowski's collection](https://huggingface.co/bartowski)), use a model identifier of the form `"llama-cpp-python/<hugging_face_repo_id>/<filename>@<n_ctx>"`, where `n_ctx` is an optional parameter that specifies the context size of the model.
+> 🧠 RAGLite extends [LiteLLM](https://github.com/BerriAI/litellm) with support for [llama.cpp](https://github.com/ggerganov/llama.cpp) models using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python). To select a llama.cpp model (e.g., from [Unsloth's collection](https://huggingface.co/unsloth)), use a model identifier of the form `"llama-cpp-python/<hugging_face_repo_id>/<filename>@<n_ctx>"`, where `n_ctx` is an optional parameter that specifies the context size of the model.
 
 > [!TIP]
 > 💾 You can create a PostgreSQL database in a few clicks at [neon.tech](https://neon.tech).
@@ -112,7 +112,7 @@ my_config = RAGLiteConfig(
 # Example 'local' config with a SQLite database and a llama.cpp LLM:
 my_config = RAGLiteConfig(
     db_url="sqlite:///raglite.db",
-    llm="llama-cpp-python/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/*Q4_K_M.gguf@8192",
+    llm="llama-cpp-python/unsloth/Qwen3-8B-GGUF/*Q4_K_M.gguf@8192",
     embedder="llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@1024",  # A context size of 1024 tokens is the sweet spot for bge-m3
 )
 ```
@@ -308,7 +308,7 @@ RAGLite comes with an [MCP server](https://modelcontextprotocol.io) implemented
 ```
 raglite \
     --db-url sqlite:///raglite.db \
-    --llm llama-cpp-python/bartowski/Llama-3.2-3B-Instruct-GGUF/*Q4_K_M.gguf@4096 \
+    --llm llama-cpp-python/unsloth/Qwen3-4B-GGUF/*Q4_K_M.gguf@4096 \
     --embedder llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@1024 \
     mcp install
 ```
@@ -344,7 +344,7 @@ You can specify the database URL, LLM, and embedder directly in the Chainlit fro
 ```sh
 raglite \
     --db-url sqlite:///raglite.db \
-    --llm llama-cpp-python/bartowski/Llama-3.2-3B-Instruct-GGUF/*Q4_K_M.gguf@4096 \
+    --llm llama-cpp-python/unsloth/Qwen3-4B-GGUF/*Q4_K_M.gguf@4096 \
     --embedder llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@1024 \
     chainlit
 ```

pyproject.toml (+2, -2)

@@ -25,7 +25,7 @@ dependencies = [
     "scipy (>=1.11.2,!=1.15.0.*,!=1.15.1,!=1.15.2)",
     "wtpsplit-lite (>=0.1.0)",
     # Large Language Models:
-    "huggingface-hub (>=0.22.0)",
+    "huggingface-hub[hf_xet] (>=0.30.0)",
     "litellm (>=1.60.2)",
     "pydantic (>=2.7.0)",
     # Approximate Nearest Neighbors:
@@ -82,7 +82,7 @@ dev = [
 # Frontend:
 chainlit = ["chainlit (>=2.0.0)"]
 # Large Language Models:
-llama-cpp-python = ["llama-cpp-python (>=0.3.4)"]
+llama-cpp-python = ["llama-cpp-python (>=0.3.9)"]
 # Markdown conversion:
 pandoc = ["pypandoc-binary (>=1.13)"]
 # Evaluation:

src/raglite/_chatml_function_calling.py (+25, -8)

@@ -6,6 +6,9 @@
    b. ✨ Add function descriptions to the system message so that tool use is better informed (fixes https://github.com/abetlen/llama-cpp-python/issues/1869).
    c. ✨ Replace `print` statements relating to JSON grammars with `RuntimeWarning` warnings.
    d. ✅ Add tests with fairly broad coverage of the different scenarios.
+   e. 🐛 Fix a 'content' KeyError in the prompt template.
+   f. ✨ Add support for Qwen3's <|endoftext|> separator.
+   g. ✨ Add support for Qwen3's <think>...</think> mode to (auto and fixed) function calling.
 4. Case "Tool choice by user":
    a. ✨ Add support for more than one function call by making this a special case of "Automatic tool choice" with a single tool (subsumes https://github.com/abetlen/llama-cpp-python/pull/1503).
 5. Case "Automatic tool choice -> respond with a message":
@@ -326,12 +329,13 @@ def chatml_function_calling_with_streaming(
 
     # Collect the llama.create_completion keyword arguments so we don't have to repeat these with
     # each completion call
+    default_stop = ["<|im_end|>", "<|endoftext|>"]
     stop = (
-        [stop, "<|im_end|>"]
+        [stop, *default_stop]
         if isinstance(stop, str)
-        else [*stop, "<|im_end|>"]
+        else [*stop, *default_stop]
         if stop
-        else ["<|im_end|>"]
+        else default_stop
     )
     grammar = (  # It is assumed the grammar applies to messages only, not tool calls
         grammar
@@ -398,11 +402,16 @@ def chatml_function_calling_with_streaming(
     )
     initial_gbnf_tool_grammar = (
         (
-            'root ::= "<function_calls>" "\\n" functions | "message:"\n'
+            'root ::= think? ("<function_calls>" "\\n" functions | "message:")\n'
             f"functions ::= {function_names}\n"
+            'think ::= "<think>" [^<]* "</think>" "\\n\\n"\n'
         )
         if tool_choice == "auto"
-        else f'root ::= "<function_calls>" "\\n" functions\nfunctions ::= {function_names}\n'
+        else (
+            f'root ::= think? "<function_calls>" "\\n" functions\n'
+            f"functions ::= {function_names}\n"
+            'think ::= "<think>" [^<]* "</think>" "\\n\\n"\n'
+        )
     )
     completion = cast(
         "llama_types.CreateCompletionResponse",
@@ -412,7 +421,6 @@ def chatml_function_calling_with_streaming(
             **completion_kwargs,
             "temperature": 0,
             "stream": False,
-            "stop": [":"],
             "max_tokens": None,
             "grammar": llama_grammar.LlamaGrammar.from_string(
                 initial_gbnf_tool_grammar, verbose=llama.verbose
@@ -421,7 +429,15 @@ def chatml_function_calling_with_streaming(
         ),
     )
     text = completion["choices"][0]["text"]
-    tool_name = None if text.startswith("message") else text.split("\n")[-1][len("functions.") :]
+    if "</think>\n\n" in text:
+        think, text = text.split("</think>\n\n", maxsplit=1)
+        prompt += think + "</think>\n\n"
+        text = text.strip()
+    tool_name = (
+        None
+        if text.startswith("message")
+        else text.split("\n")[-1][len("functions.") :].rstrip(":")
+    )
 
     # Case 2 step 2A: Respond with a message
     if tool_name is None:
@@ -439,7 +455,8 @@ def chatml_function_calling_with_streaming(
 
     # Case 2 step 2B: One or more function calls
     follow_up_gbnf_tool_grammar = (
-        f'root ::= functions | "</function_calls>" | "<|im_end|>"\nfunctions ::= {function_names}\n'
+        'root ::= functions | "</function_calls>" | "<|im_end|>" | "<|endoftext|>"\n'
+        f"functions ::= {function_names}\n"
     )
     prompt += "<function_calls>\n"
     if stream:
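
The Qwen3-related changes in this file boil down to two things: the first grammar-constrained completion may now begin with a <think>...</think> block that is spliced back into the prompt before the tool choice is parsed, and the parsed tool name can carry a trailing ":" now that the `"stop": [":"]` argument is gone. A minimal standalone sketch of that parsing step, with a hypothetical helper name and made-up example strings that mirror the grammar in the diff:

```python
def split_think_and_tool(text: str) -> tuple[str | None, str | None]:
    """Split an optional <think>...</think> prefix from the grammar-constrained output.

    Returns (think, tool_name); tool_name is None when the model chose to answer
    with a plain message instead of calling a tool.
    """
    think = None
    if "</think>\n\n" in text:
        think, text = text.split("</think>\n\n", maxsplit=1)
        think += "</think>\n\n"  # The real code appends this back onto the prompt.
        text = text.strip()
    tool_name = (
        None
        if text.startswith("message")
        else text.split("\n")[-1][len("functions.") :].rstrip(":")
    )
    return think, tool_name


# Example outputs (input strings are assumptions based on the GBNF grammar above):
assert split_think_and_tool("message:") == (None, None)
assert split_think_and_tool(
    "<think>The user asks for the weather.</think>\n\n<function_calls>\nfunctions.get_weather:"
) == ("<think>The user asks for the weather.</think>\n\n", "get_weather")
```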

src/raglite/_config.py (+2, -2)

@@ -31,9 +31,9 @@ class RAGLiteConfig:
     # LLM config used for generation.
     llm: str = field(
         default_factory=lambda: (
-            "llama-cpp-python/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/*Q4_K_M.gguf@8192"
+            "llama-cpp-python/unsloth/Qwen3-8B-GGUF/*Q4_K_M.gguf@8192"
             if llama_supports_gpu_offload()
-            else "llama-cpp-python/bartowski/Llama-3.2-3B-Instruct-GGUF/*Q4_K_M.gguf@4096"
+            else "llama-cpp-python/unsloth/Qwen3-4B-GGUF/*Q4_K_M.gguf@4096"
         )
     )
     llm_max_tries: int = 4
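
With this change the default llama.cpp LLM is Qwen3-8B when GPU offloading is available and Qwen3-4B otherwise. A quick way to see which default applies on a given machine; the import paths are assumptions based on the README and the diff above:

```python
from llama_cpp import llama_supports_gpu_offload  # Used by RAGLiteConfig to pick the default LLM.

from raglite import RAGLiteConfig

config = RAGLiteConfig(db_url="sqlite:///raglite.db")
print(llama_supports_gpu_offload())  # True -> Qwen3-8B@8192 default, False -> Qwen3-4B@4096 default.
print(config.llm)  # e.g. "llama-cpp-python/unsloth/Qwen3-8B-GGUF/*Q4_K_M.gguf@8192"
```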

src/raglite/_litellm.py (+14, -1)

@@ -55,7 +55,7 @@ class LlamaCppPythonLLM(CustomLLM):
     from litellm import completion
 
     response = completion(
-        model="llama-cpp-python/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/*Q4_K_M.gguf@4092",
+        model="llama-cpp-python/unsloth/Qwen3-8B-GGUF/*Q4_K_M.gguf@8192",
         messages=[{"role": "user", "content": "Hello world!"}],
         # stream=True
     )
@@ -165,6 +165,17 @@ def _translate_openai_params(self, optional_params: dict[str, Any]) -> dict[str,
         }
         return llama_cpp_python_params
 
+    def _add_recommended_model_params(
+        self, model: str, llama_cpp_python_params: dict[str, Any]
+    ) -> dict[str, Any]:
+        """Add recommended model settings."""
+        recommended_settings = {}
+        if "qwen3" in model.lower():
+            # Add the official recommended 'thinking mode' settings [1].
+            # [1] https://docs.unsloth.ai/basics/qwen3-how-to-run-and-fine-tune#official-recommended-settings
+            recommended_settings = {"temperature": 0.6, "min_p": 0.0, "top_p": 0.95, "top_k": 20}
+        return {**recommended_settings, **llama_cpp_python_params}
+
     def completion(  # noqa: PLR0913
         self,
         model: str,
@@ -186,6 +197,7 @@ def completion(  # noqa: PLR0913
     ) -> ModelResponse:
         llm = self.llm(model)
         llama_cpp_python_params = self._translate_openai_params(optional_params)
+        llama_cpp_python_params = self._add_recommended_model_params(model, llama_cpp_python_params)
         response = cast(
             "llama_types.CreateChatCompletionResponse",
             llm.create_chat_completion(messages=messages, **llama_cpp_python_params),
@@ -219,6 +231,7 @@ def streaming(  # noqa: PLR0913
     ) -> Iterator[GenericStreamingChunk]:
         llm = self.llm(model)
         llama_cpp_python_params = self._translate_openai_params(optional_params)
+        llama_cpp_python_params = self._add_recommended_model_params(model, llama_cpp_python_params)
         stream = cast(
             "Iterator[llama_types.CreateChatCompletionStreamResponse]",
             llm.create_chat_completion(messages=messages, **llama_cpp_python_params, stream=True),
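
Note the unpacking order in `_add_recommended_model_params`: the recommended Qwen3 sampling settings act as defaults, and any parameters the caller passes through LiteLLM override them. A minimal illustration of that merge precedence (the caller-supplied values are made up):

```python
# Recommended Qwen3 'thinking mode' settings from the diff above act as defaults.
recommended = {"temperature": 0.6, "min_p": 0.0, "top_p": 0.95, "top_k": 20}
# Hypothetical caller-supplied parameters; the later unpacking wins on key collisions.
user_params = {"temperature": 0.2, "max_tokens": 512}

merged = {**recommended, **user_params}
print(merged)
# {'temperature': 0.2, 'min_p': 0.0, 'top_p': 0.95, 'top_k': 20, 'max_tokens': 512}
```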

tests/test_chatml_function_calling.py (+2, -2)

@@ -61,8 +61,8 @@ def is_accelerator_available() -> bool:
     [
         pytest.param("bartowski/Llama-3.2-3B-Instruct-GGUF", id="llama_3.2_3B"),
         pytest.param(
-            "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
-            id="llama_3.1_8B",
+            "unsloth/Qwen3-8B-GGUF",
+            id="qwen3_8B",
             marks=pytest.mark.skipif(
                 not is_accelerator_available(), reason="Accelerator not available"
             ),
