
Commit 7353453

feat: add support for reasoning tool use and upgrade to Qwen3 (#124)
1 parent ff95d2f commit 7353453

File tree: 7 files changed (+52, -21 lines)

- .github/workflows/test.yml
- README.md
- pyproject.toml
- src/raglite/_chatml_function_calling.py
- src/raglite/_config.py
- src/raglite/_litellm.py
- tests/test_chatml_function_calling.py

.github/workflows/test.yml (+2, -1)

@@ -9,7 +9,8 @@ on:
 
 jobs:
   test:
-    runs-on: ubuntu-latest
+    runs-on:
+      group: raglite
 
     strategy:
       fail-fast: false

README.md (+5, -5)

@@ -44,7 +44,7 @@ RAGLite is a Python toolkit for Retrieval-Augmented Generation (RAG) with Postgr
 > 🚀 If you want to use local models, it is recommended to install [an accelerated llama-cpp-python precompiled binary](https://github.com/abetlen/llama-cpp-python?tab=readme-ov-file#supported-backends) with:
 > ```sh
 > # Configure which llama-cpp-python precompiled binary to install (⚠️ not every combination is available):
-> LLAMA_CPP_PYTHON_VERSION=0.3.4
+> LLAMA_CPP_PYTHON_VERSION=0.3.9
 > PYTHON_VERSION=310|311|312
 > ACCELERATOR=metal|cu121|cu122|cu123|cu124
 > PLATFORM=macosx_11_0_arm64|linux_x86_64|win_amd64
@@ -92,7 +92,7 @@ pip install raglite[ragas]
 ### 1. Configuring RAGLite
 
 > [!TIP]
-> 🧠 RAGLite extends [LiteLLM](https://github.com/BerriAI/litellm) with support for [llama.cpp](https://github.com/ggerganov/llama.cpp) models using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python). To select a llama.cpp model (e.g., from [bartowski's collection](https://huggingface.co/bartowski)), use a model identifier of the form `"llama-cpp-python/<hugging_face_repo_id>/<filename>@<n_ctx>"`, where `n_ctx` is an optional parameter that specifies the context size of the model.
+> 🧠 RAGLite extends [LiteLLM](https://github.com/BerriAI/litellm) with support for [llama.cpp](https://github.com/ggerganov/llama.cpp) models using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python). To select a llama.cpp model (e.g., from [Unsloth's collection](https://huggingface.co/unsloth)), use a model identifier of the form `"llama-cpp-python/<hugging_face_repo_id>/<filename>@<n_ctx>"`, where `n_ctx` is an optional parameter that specifies the context size of the model.
 
 > [!TIP]
 > 💾 You can create a PostgreSQL database in a few clicks at [neon.tech](https://neon.tech).
@@ -112,7 +112,7 @@ my_config = RAGLiteConfig(
 # Example 'local' config with a SQLite database and a llama.cpp LLM:
 my_config = RAGLiteConfig(
     db_url="sqlite:///raglite.db",
-    llm="llama-cpp-python/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/*Q4_K_M.gguf@8192",
+    llm="llama-cpp-python/unsloth/Qwen3-8B-GGUF/*Q4_K_M.gguf@8192",
     embedder="llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@1024",  # A context size of 1024 tokens is the sweet spot for bge-m3
 )
 ```
@@ -308,7 +308,7 @@ RAGLite comes with an [MCP server](https://modelcontextprotocol.io) implemented
 ```
 raglite \
     --db-url sqlite:///raglite.db \
-    --llm llama-cpp-python/bartowski/Llama-3.2-3B-Instruct-GGUF/*Q4_K_M.gguf@4096 \
+    --llm llama-cpp-python/unsloth/Qwen3-4B-GGUF/*Q4_K_M.gguf@4096 \
     --embedder llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@1024 \
     mcp install
 ```
@@ -344,7 +344,7 @@ You can specify the database URL, LLM, and embedder directly in the Chainlit fro
 ```sh
 raglite \
     --db-url sqlite:///raglite.db \
-    --llm llama-cpp-python/bartowski/Llama-3.2-3B-Instruct-GGUF/*Q4_K_M.gguf@4096 \
+    --llm llama-cpp-python/unsloth/Qwen3-4B-GGUF/*Q4_K_M.gguf@4096 \
     --embedder llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@1024 \
     chainlit
 ```

pyproject.toml (+2, -2)

@@ -25,7 +25,7 @@ dependencies = [
     "scipy (>=1.11.2,!=1.15.0.*,!=1.15.1,!=1.15.2)",
     "wtpsplit-lite (>=0.1.0)",
     # Large Language Models:
-    "huggingface-hub (>=0.22.0)",
+    "huggingface-hub[hf_xet] (>=0.30.0)",
     "litellm (>=1.60.2)",
     "pydantic (>=2.7.0)",
     # Approximate Nearest Neighbors:
@@ -82,7 +82,7 @@ dev = [
 # Frontend:
 chainlit = ["chainlit (>=2.0.0)"]
 # Large Language Models:
-llama-cpp-python = ["llama-cpp-python (>=0.3.4)"]
+llama-cpp-python = ["llama-cpp-python (>=0.3.9)"]
 # Markdown conversion:
 pandoc = ["pypandoc-binary (>=1.13)"]
 # Evaluation:

src/raglite/_chatml_function_calling.py (+25, -8)

@@ -6,6 +6,9 @@
    b. ✨ Add function descriptions to the system message so that tool use is better informed (fixes https://github.com/abetlen/llama-cpp-python/issues/1869).
    c. ✨ Replace `print` statements relating to JSON grammars with `RuntimeWarning` warnings.
    d. ✅ Add tests with fairly broad coverage of the different scenarios.
+   e. 🐛 Fix a 'content' KeyError in the prompt template.
+   f. ✨ Add support for Qwen3's <|endoftext|> separator.
+   g. ✨ Add support for Qwen3's <think>...</think> mode to (auto and fixed) function calling.
 4. Case "Tool choice by user":
    a. ✨ Add support for more than one function call by making this a special case of "Automatic tool choice" with a single tool (subsumes https://github.com/abetlen/llama-cpp-python/pull/1503).
 5. Case "Automatic tool choice -> respond with a message":
@@ -326,12 +329,13 @@ def chatml_function_calling_with_streaming(
 
     # Collect the llama.create_completion keyword arguments so we don't have to repeat these with
     # each completion call
+    default_stop = ["<|im_end|>", "<|endoftext|>"]
     stop = (
-        [stop, "<|im_end|>"]
+        [stop, *default_stop]
         if isinstance(stop, str)
-        else [*stop, "<|im_end|>"]
+        else [*stop, *default_stop]
         if stop
-        else ["<|im_end|>"]
+        else default_stop
     )
     grammar = (  # It is assumed the grammar applies to messages only, not tool calls
         grammar
@@ -398,11 +402,16 @@ def chatml_function_calling_with_streaming(
     )
     initial_gbnf_tool_grammar = (
         (
-            'root ::= "<function_calls>" "\\n" functions | "message:"\n'
+            'root ::= think? ("<function_calls>" "\\n" functions | "message:")\n'
             f"functions ::= {function_names}\n"
+            'think ::= "<think>" [^<]* "</think>" "\\n\\n"\n'
         )
         if tool_choice == "auto"
-        else f'root ::= "<function_calls>" "\\n" functions\nfunctions ::= {function_names}\n'
+        else (
+            f'root ::= think? "<function_calls>" "\\n" functions\n'
+            f"functions ::= {function_names}\n"
+            'think ::= "<think>" [^<]* "</think>" "\\n\\n"\n'
+        )
     )
     completion = cast(
         "llama_types.CreateCompletionResponse",
@@ -412,7 +421,6 @@ def chatml_function_calling_with_streaming(
             **completion_kwargs,
             "temperature": 0,
             "stream": False,
-            "stop": [":"],
             "max_tokens": None,
             "grammar": llama_grammar.LlamaGrammar.from_string(
                 initial_gbnf_tool_grammar, verbose=llama.verbose
@@ -421,7 +429,15 @@ def chatml_function_calling_with_streaming(
         ),
     )
     text = completion["choices"][0]["text"]
-    tool_name = None if text.startswith("message") else text.split("\n")[-1][len("functions.") :]
+    if "</think>\n\n" in text:
+        think, text = text.split("</think>\n\n", maxsplit=1)
+        prompt += think + "</think>\n\n"
+        text = text.strip()
+    tool_name = (
+        None
+        if text.startswith("message")
+        else text.split("\n")[-1][len("functions.") :].rstrip(":")
+    )
 
     # Case 2 step 2A: Respond with a message
     if tool_name is None:
@@ -439,7 +455,8 @@ def chatml_function_calling_with_streaming(
 
     # Case 2 step 2B: One or more function calls
     follow_up_gbnf_tool_grammar = (
-        f'root ::= functions | "</function_calls>" | "<|im_end|>"\nfunctions ::= {function_names}\n'
+        'root ::= functions | "</function_calls>" | "<|im_end|>" | "<|endoftext|>"\n'
+        f"functions ::= {function_names}\n"
     )
     prompt += "<function_calls>\n"
     if stream:
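
The Qwen3-related changes in this file boil down to two things: the first grammar-constrained completion may now begin with a <think>...</think> block that is spliced back into the prompt before the tool choice is parsed, and the parsed tool name can carry a trailing ":" now that the `"stop": [":"]` argument is gone. A minimal standalone sketch of that parsing step, with a hypothetical helper name and made-up example strings that mirror the grammar in the diff:

```python
def split_think_and_tool(text: str) -> tuple[str | None, str | None]:
    """Split an optional <think>...</think> prefix from the grammar-constrained output.

    Returns (think, tool_name); tool_name is None when the model chose to answer
    with a plain message instead of calling a tool.
    """
    think = None
    if "</think>\n\n" in text:
        think, text = text.split("</think>\n\n", maxsplit=1)
        think += "</think>\n\n"  # The real code appends this back onto the prompt.
        text = text.strip()
    tool_name = (
        None
        if text.startswith("message")
        else text.split("\n")[-1][len("functions.") :].rstrip(":")
    )
    return think, tool_name


# Example outputs (input strings are assumptions based on the GBNF grammar above):
assert split_think_and_tool("message:") == (None, None)
assert split_think_and_tool(
    "<think>The user asks for the weather.</think>\n\n<function_calls>\nfunctions.get_weather:"
) == ("<think>The user asks for the weather.</think>\n\n", "get_weather")
```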

src/raglite/_config.py (+2, -2)

@@ -31,9 +31,9 @@ class RAGLiteConfig:
     # LLM config used for generation.
     llm: str = field(
         default_factory=lambda: (
-            "llama-cpp-python/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/*Q4_K_M.gguf@8192"
+            "llama-cpp-python/unsloth/Qwen3-8B-GGUF/*Q4_K_M.gguf@8192"
             if llama_supports_gpu_offload()
-            else "llama-cpp-python/bartowski/Llama-3.2-3B-Instruct-GGUF/*Q4_K_M.gguf@4096"
+            else "llama-cpp-python/unsloth/Qwen3-4B-GGUF/*Q4_K_M.gguf@4096"
         )
     )
     llm_max_tries: int = 4
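
With this change the default llama.cpp LLM is Qwen3-8B when GPU offloading is available and Qwen3-4B otherwise. A quick way to see which default applies on a given machine; the import paths are assumptions based on the README and the diff above:

```python
from llama_cpp import llama_supports_gpu_offload  # Used by RAGLiteConfig to pick the default LLM.

from raglite import RAGLiteConfig

config = RAGLiteConfig(db_url="sqlite:///raglite.db")
print(llama_supports_gpu_offload())  # True -> Qwen3-8B@8192 default, False -> Qwen3-4B@4096 default.
print(config.llm)  # e.g. "llama-cpp-python/unsloth/Qwen3-8B-GGUF/*Q4_K_M.gguf@8192"
```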

src/raglite/_litellm.py (+14, -1)

@@ -55,7 +55,7 @@ class LlamaCppPythonLLM(CustomLLM):
     from litellm import completion
 
     response = completion(
-        model="llama-cpp-python/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/*Q4_K_M.gguf@4092",
+        model="llama-cpp-python/unsloth/Qwen3-8B-GGUF/*Q4_K_M.gguf@8192",
         messages=[{"role": "user", "content": "Hello world!"}],
         # stream=True
     )
@@ -165,6 +165,17 @@ def _translate_openai_params(self, optional_params: dict[str, Any]) -> dict[str,
         }
         return llama_cpp_python_params
 
+    def _add_recommended_model_params(
+        self, model: str, llama_cpp_python_params: dict[str, Any]
+    ) -> dict[str, Any]:
+        """Add recommended model settings."""
+        recommended_settings = {}
+        if "qwen3" in model.lower():
+            # Add the official recommended 'thinking mode' settings [1].
+            # [1] https://docs.unsloth.ai/basics/qwen3-how-to-run-and-fine-tune#official-recommended-settings
+            recommended_settings = {"temperature": 0.6, "min_p": 0.0, "top_p": 0.95, "top_k": 20}
+        return {**recommended_settings, **llama_cpp_python_params}
+
     def completion(  # noqa: PLR0913
         self,
         model: str,
@@ -186,6 +197,7 @@ def completion(  # noqa: PLR0913
     ) -> ModelResponse:
         llm = self.llm(model)
         llama_cpp_python_params = self._translate_openai_params(optional_params)
+        llama_cpp_python_params = self._add_recommended_model_params(model, llama_cpp_python_params)
         response = cast(
             "llama_types.CreateChatCompletionResponse",
             llm.create_chat_completion(messages=messages, **llama_cpp_python_params),
@@ -219,6 +231,7 @@ def streaming(  # noqa: PLR0913
     ) -> Iterator[GenericStreamingChunk]:
         llm = self.llm(model)
         llama_cpp_python_params = self._translate_openai_params(optional_params)
+        llama_cpp_python_params = self._add_recommended_model_params(model, llama_cpp_python_params)
         stream = cast(
             "Iterator[llama_types.CreateChatCompletionStreamResponse]",
             llm.create_chat_completion(messages=messages, **llama_cpp_python_params, stream=True),
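
Note the unpacking order in `_add_recommended_model_params`: the recommended Qwen3 sampling settings act as defaults, and any parameters the caller passes through LiteLLM override them. A minimal illustration of that merge precedence (the caller-supplied values are made up):

```python
# Recommended Qwen3 'thinking mode' settings from the diff above act as defaults.
recommended = {"temperature": 0.6, "min_p": 0.0, "top_p": 0.95, "top_k": 20}
# Hypothetical caller-supplied parameters; the later unpacking wins on key collisions.
user_params = {"temperature": 0.2, "max_tokens": 512}

merged = {**recommended, **user_params}
print(merged)
# {'temperature': 0.2, 'min_p': 0.0, 'top_p': 0.95, 'top_k': 20, 'max_tokens': 512}
```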

tests/test_chatml_function_calling.py (+2, -2)

@@ -61,8 +61,8 @@ def is_accelerator_available() -> bool:
     [
         pytest.param("bartowski/Llama-3.2-3B-Instruct-GGUF", id="llama_3.2_3B"),
         pytest.param(
-            "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
-            id="llama_3.1_8B",
+            "unsloth/Qwen3-8B-GGUF",
+            id="qwen3_8B",
             marks=pytest.mark.skipif(
                 not is_accelerator_available(), reason="Accelerator not available"
             ),
