Commit 73dd1dd

perf: ⚡️ Remove langchain for model calling
1 parent e36ee9f commit 73dd1dd

14 files changed · +249 additions · −127 deletions

apps/dashboard/pages/api/agents/[id]/query.tsx

Lines changed: 1 addition & 1 deletion
@@ -128,7 +128,7 @@ export const chatAgentRequest = async (
    agent.modelName = data.modelName;
  }

- const manager = new AgentManager({ agent, topK: 5 });
+ const manager = new AgentManager({ agent, topK: 50 });
  const ctrl = new AbortController();

  if (data.streaming) {

packages/lib/agent.ts

Lines changed: 23 additions & 11 deletions
@@ -1,6 +1,7 @@
  import axios from 'axios';
  import { ChatOpenAI } from 'langchain/chat_models/openai';
  import { AIMessage, HumanMessage, SystemMessage } from 'langchain/schema';
+ import { ChatCompletionMessageParam } from 'openai/resources';

  import {
    Agent,
@@ -284,19 +285,30 @@ export default class AgentManager {
    const _promptType = promptType || this.agent.promptType;
    const _promptTemplate = promptTemplate || (this.agent.prompt as string);

-   let initialMessages: any = [];
+   let initialMessages: ChatCompletionMessageParam[] = [];
    if (_promptType === PromptType.customer_support) {
      initialMessages = [
-       new SystemMessage(
-         `${_promptTemplate}
-         Answer the query in the same language in which the query is asked.
-         Give answer in the markdown rich format with proper bolds, italics etc as per heirarchy and readability requirements.
-         You will be provided by a context retrieved by the knowledge_base_retrieval function.
-         If the context does not contain the information needed to answer this query then politely say that you don't know without mentioning the existence of a context.
-         Remember do not answer any query that is outside of the provided context nor mention its existence.
-         You are allowed to use the following conversation history to answer the query.
-         `
-       ),
+       {
+         role: 'system',
+         content: `${_promptTemplate}
+         Answer the query in the same language in which the query is asked.
+         Give answer in the markdown rich format with proper bolds, italics etc as per heirarchy and readability requirements.
+         You will be provided by a context retrieved by the knowledge_base_retrieval function.
+         If the context does not contain the information needed to answer this query then politely say that you don't know without mentioning the existence of a context.
+         Remember do not answer any query that is outside of the provided context nor mention its existence.
+         You are allowed to use the following conversation history to answer the query.
+         `,
+       },
+       // new SystemMessage(
+       //   `${_promptTemplate}
+       //   Answer the query in the same language in which the query is asked.
+       //   Give answer in the markdown rich format with proper bolds, italics etc as per heirarchy and readability requirements.
+       //   You will be provided by a context retrieved by the knowledge_base_retrieval function.
+       //   If the context does not contain the information needed to answer this query then politely say that you don't know without mentioning the existence of a context.
+       //   Remember do not answer any query that is outside of the provided context nor mention its existence.
+       //   You are allowed to use the following conversation history to answer the query.
+       //   `
+       // ),
        // new HumanMessage(`${_promptTemplate}
        // Answer the message in the same language in which the message is asked.
        // If you don't find an answer from the chunks, politely say that you don't know without mentioning the existence of a context. Don't try to make up an answer.

packages/lib/chains/chat-retrieval.ts

Lines changed: 15 additions & 1 deletion
@@ -7,8 +7,10 @@ import { ChatRequest } from '@chaindesk/lib/types/dtos';
  import { Datastore, MessageFrom } from '@chaindesk/prisma';

  import chat, { ChatProps } from '../chatv2';
+ import { ModelConfig } from '../config';
  import createPromptContext from '../create-prompt-context';
  import retrieval from '../retrieval';
+ import truncateArray from '../truncateArray';

  export type ChatRetrievalChainProps = Omit<ChatProps, 'prompt'> & {
    datastore?: Datastore;
@@ -39,7 +41,7 @@ const chatRetrieval = async ({
    abortController,
    ...otherProps
  }: ChatRetrievalChainProps) => {
-   const results = retrievalSearch
+   const _results = retrievalSearch
      ? await retrieval({
          datastore,
          filters,
@@ -48,6 +50,18 @@ const chatRetrieval = async ({
        })
      : [];

+   const results = await truncateArray<AppDocument<ChunkMetadataRetrieved>>({
+     items: _results,
+     getText: (item) => item.pageContent,
+     setText: (item, text) => {
+       return {
+         ...item,
+         pageContent: text,
+       };
+     },
+     maxTokens: ModelConfig?.[modelName!]?.maxTokens * 0.2,
+   });
+
    const prompt = getPrompt(results);

    // Generate answer

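A rough worked example of the new retrieval budget (illustrative only, not part of the commit; it assumes ModelConfig is keyed by the AgentModelName enum and that the config module is importable under the package name shown):

// Hypothetical sketch: with gpt_3_5_turbo's maxTokens raised to 16385 (see config.ts below),
// retrieved chunks are truncated to roughly 20% of the context window before prompting.
import { AgentModelName } from '@chaindesk/prisma';
import { ModelConfig } from '@chaindesk/lib/config';

const retrievalBudget = ModelConfig[AgentModelName.gpt_3_5_turbo].maxTokens * 0.2;
// 16385 * 0.2 ≈ 3277 tokens reserved for retrieved context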
packages/lib/chains/index.ts

Lines changed: 7 additions & 3 deletions
@@ -100,9 +100,13 @@ export default class ChainManager {

    return chat({
      initialMessages: [
-       new SystemMessage(
-         `You are a productivity assistant. Please provide a helpful and professional response to the user's question or issue.`
-       ),
+       {
+         role: 'system',
+         content: `You are a productivity assistant. Please provide a helpful and professional response to the user's question or issue.`,
+       },
+       // new SystemMessage(
+       //   `You are a productivity assistant. Please provide a helpful and professional response to the user's question or issue.`
+       // ),
      ],
      prompt: input,
      temperature: temperature || 0.5,

packages/lib/chains/qa.ts

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ const qa = async ({
    abortController,
  }: QAChainProps) => {
    return chatRetrieval({
-     modelName: 'gpt_3_5_turbo_16k',
+     modelName: 'gpt_3_5_turbo',
      retrievalSearch: query,
      getPrompt(chunks) {
        return promptInject({

packages/lib/chat-model.ts

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+ import OpenAI, { ClientOptions } from 'openai';
+ import { ChatCompletionMessageParam, CompletionUsage } from 'openai/resources';
+ import pRetry from 'p-retry';
+
+ import failedAttemptHandler from './lc-failed-attempt-hanlder';
+
+ export default class ChatModel {
+   public openai: OpenAI;
+
+   constructor(options: ClientOptions) {
+     this.openai = new OpenAI({
+       ...options,
+     });
+   }
+
+   static countTokensMessages(messages: ChatCompletionMessageParam[]) {
+     let counter = 0;
+
+     for (const each of messages) {
+       counter += each?.content?.length || 0;
+     }
+
+     return counter / 4;
+   }
+
+   async call({
+     handleStream,
+     signal,
+     ...otherProps
+   }: Parameters<typeof this.openai.chat.completions.create>[0] & {
+     handleStream?: (text: string) => any;
+     signal?: AbortSignal;
+   }) {
+     return pRetry(
+       async () => {
+         if (!!handleStream) {
+           let usage: CompletionUsage = {
+             completion_tokens: 0,
+             prompt_tokens: ChatModel.countTokensMessages(otherProps?.messages),
+             total_tokens: 0,
+           };
+
+           const streaming = await this.openai.chat.completions.create({
+             ...otherProps,
+             stream: true,
+           });
+
+           let buffer = '';
+           for await (const chunk of streaming) {
+             const content = chunk.choices[0]?.delta?.content || '';
+
+             handleStream?.(content);
+             buffer += content;
+             usage.completion_tokens += 1;
+           }
+
+           usage.total_tokens = usage.prompt_tokens + usage.completion_tokens;
+
+           return {
+             answer: buffer?.trim?.(),
+             usage,
+           };
+         } else {
+           const response = await this.openai.chat.completions.create({
+             ...otherProps,
+             stream: false,
+           });
+
+           return {
+             answer: response?.choices?.[0]?.message?.content?.trim?.(),
+             usage: response?.usage,
+           };
+         }
+       },
+       {
+         signal,
+         retries: 6,
+         onFailedAttempt: failedAttemptHandler,
+       }
+     );
+   }
+ }

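A minimal usage sketch of the new wrapper (illustrative only, not part of the commit; the import path and API-key handling are assumptions):

// Hypothetical usage of ChatModel, which wraps the official openai client with p-retry.
import ChatModel from '@chaindesk/lib/chat-model';

const model = new ChatModel({ apiKey: process.env.OPENAI_API_KEY });

const { answer, usage } = await model.call({
  model: 'gpt-3.5-turbo-1106',
  messages: [{ role: 'user', content: 'Hello!' }],
  // Providing handleStream switches call() to streaming mode and
  // forwards each token delta as it arrives.
  handleStream: (token) => process.stdout.write(token),
});

console.log(answer, usage?.total_tokens);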
packages/lib/chatv2.ts

Lines changed: 32 additions & 78 deletions
@@ -1,17 +1,11 @@
- import { ChatOpenAI } from 'langchain/chat_models/openai';
- import {
-   AIMessage,
-   BaseMessage,
-   FunctionMessage,
-   HumanMessage,
-   MessageContent,
- } from 'langchain/schema';
+ import { ChatCompletionMessageParam } from 'openai/resources';

  import { AgentModelName, Message, MessageFrom } from '@chaindesk/prisma';

  import { ChatModelConfigSchema, ChatResponse } from './types/dtos';
+ import ChatModel from './chat-model';
  import { ModelConfig } from './config';
- import failedAttemptHandler from './lc-failed-attempt-hanlder';
+ import formatMessagesOpenAI from './format-messages-openai';
  import truncateChatMessages from './truncateChatMessages';

  export type ChatProps = ChatModelConfigSchema & {
@@ -20,7 +14,7 @@ export type ChatProps = ChatModelConfigSchema & {
    modelName?: AgentModelName;
    history?: Message[];
    abortController?: any;
-   initialMessages?: BaseMessage[] | undefined;
+   initialMessages?: ChatCompletionMessageParam[] | undefined;
    context?: string;
    useXpContext?: boolean;
  };
@@ -37,95 +31,55 @@ const chat = async ({
    useXpContext,
    ...otherProps
  }: ChatProps) => {
-   let totalCompletionTokens = 0;
-   let totalPromptTokens = 0;
-   let totalExecutionTokens = 0;
-
-   const model = new ChatOpenAI({
-     streaming: Boolean(stream),
-     modelName: ModelConfig[modelName]?.name,
-
-     temperature: temperature || 0,
-     topP: otherProps.topP,
-     frequencyPenalty: otherProps.frequencyPenalty,
-     presencePenalty: otherProps.presencePenalty,
-     maxTokens: otherProps.maxTokens,
-     onFailedAttempt: failedAttemptHandler,
-     callbacks: [
-       {
-         handleLLMNewToken: stream,
-         handleLLMEnd: (output, runId, parentRunId?, tags?) => {
-           const { completionTokens, promptTokens, totalTokens } =
-             output.llmOutput?.tokenUsage ||
-             output.llmOutput?.estimatedTokenUsage;
-           totalCompletionTokens += completionTokens ?? 0;
-           totalPromptTokens += promptTokens ?? 0;
-           totalExecutionTokens += totalTokens ?? 0;
-         },
-         handleLLMError: async (err: Error) => {
-           console.error('handleLLMError', err);
-         },
-       },
-     ],
-   });
-
-   if (process.env.APP_ENV === 'test') {
-     model.call = async (props: any) => {
-       const res = {
-         text: 'Hello world',
-       } as any;
-
-       if (stream) {
-         stream(res.text);
-       }
-
-       return res;
-     };
-   }
-
    const truncatedHistory = (
      await truncateChatMessages({
-       messages: (history || [])
-         ?.map((each) => {
-           if (each.from === MessageFrom.human) {
-             return new HumanMessage(each.text);
-           }
-           return new AIMessage(each.text);
-         })
-         .reverse(),
+       messages: formatMessagesOpenAI(history || []).reverse(),
        maxTokens: ModelConfig[modelName]?.maxTokens * 0.3, // 30% tokens limit for history
      })
    ).reverse();

-   const messages = [
+   const messages: ChatCompletionMessageParam[] = [
      ...initialMessages,
      ...truncatedHistory,
-     ...(useXpContext && context
+     ...((useXpContext && context
        ? [
-           new FunctionMessage({
-             content: context,
+           {
+             role: 'function',
+             content: context!,
              name: 'knowledge_base_retrieval',
-           }),
+           },
          ]
-       : []),
-     new HumanMessage(prompt),
+       : []) as ChatCompletionMessageParam[]),
+     { role: 'user', content: prompt },
    ];

    // console.log('messages ===--------c_>', messages);

-   const output = await model.call(messages, {
+   const model = new ChatModel({});
+
+   const output = await model.call({
+     handleStream: stream,
+     model: ModelConfig[modelName]?.name,
+     messages,
+
+     temperature: temperature || 0,
+     top_p: otherProps.topP,
+     frequency_penalty: otherProps.frequencyPenalty,
+     presence_penalty: otherProps.presencePenalty,
+     max_tokens: otherProps.maxTokens,
      signal: abortController?.signal,
    });

-   const answer = (output?.content as string)?.trim?.();
+   const answer = output?.answer;

    const usage = {
-     completionTokens: totalCompletionTokens,
-     promptTokens: totalPromptTokens,
-     totalTokens: totalExecutionTokens,
+     completionTokens: output?.usage?.completion_tokens,
+     promptTokens: output?.usage?.prompt_tokens,
+     totalTokens: output?.usage?.total_tokens,
      cost:
-       totalPromptTokens * ModelConfig[modelName]?.providerPriceByInputToken +
-       totalCompletionTokens *
+       (output?.usage?.prompt_tokens || 0) *
+         ModelConfig[modelName]?.providerPriceByInputToken +
+       (output?.usage?.completion_tokens || 0) *
          ModelConfig[modelName]?.providerPricePriceByOutputToken,
    };

packages/lib/config.ts

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ export const XPBNPLabels = {
  export const ModelConfig = {
    [AgentModelName.gpt_3_5_turbo]: {
      name: 'gpt-3.5-turbo-1106',
-     maxTokens: 4096,
+     maxTokens: 16385,
      cost: 1,
      providerPriceByInputToken: 0.000001,
      providerPricePriceByOutputToken: 0.000002,

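A worked cost example (illustrative only, not part of the diff), matching how chatv2.ts above now derives usage.cost from the OpenAI usage object and the prices listed here:

// 1,000 prompt tokens and 500 completion tokens on gpt_3_5_turbo:
const cost =
  1000 * 0.000001 + // providerPriceByInputToken
  500 * 0.000002; // providerPricePriceByOutputToken
// => $0.002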
packages/lib/count-tokens.ts

Lines changed: 4 additions & 0 deletions
@@ -14,4 +14,8 @@ const countTokens = ({
    return nbTokens;
  };

+ export const countTokensEstimation = ({ text }: { text: string }) => {
+   return text?.length / 4;
+ };
+
  export default countTokens;
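The new helper is a character-count heuristic rather than a real tokenizer; ChatModel.countTokensMessages above relies on the same ~4 characters-per-token estimate. A quick illustration (not part of the commit):

// countTokensEstimation assumes roughly 4 characters per token.
countTokensEstimation({ text: 'a'.repeat(400) }); // ≈ 100 tokens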
packages/lib/format-messages-openai.ts

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+ import { ChatCompletionMessageParam } from 'openai/resources';
+
+ import { Message } from '@chaindesk/prisma';
+
+ const formatMessagesOpenAI = (messages: Message[]) => {
+   return messages.map((each) => {
+     let role = 'user' as ChatCompletionMessageParam['role'];
+
+     if (each.from === 'agent') {
+       role = 'assistant';
+     }
+
+     return {
+       role,
+       content: each.text,
+     } as ChatCompletionMessageParam;
+   });
+ };
+
+ export default formatMessagesOpenAI;

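A small usage sketch (illustrative only, not part of the commit; the literal objects stand in for Prisma Message records and the import path is assumed):

import formatMessagesOpenAI from '@chaindesk/lib/format-messages-openai';

// Messages stored with from: 'human' map to role: 'user';
// messages stored with from: 'agent' map to role: 'assistant'.
const openAiMessages = formatMessagesOpenAI([
  { from: 'human', text: 'Where do I change my plan?' },
  { from: 'agent', text: 'Head to Settings > Billing.' },
] as any);
// => [
//   { role: 'user', content: 'Where do I change my plan?' },
//   { role: 'assistant', content: 'Head to Settings > Billing.' },
// ]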