Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revisionPrevious revision
Next revision
Previous revision
712:configure_stages [2026/02/26 09:48] Prinz, Patrick712:configure_stages [2026/03/25 09:44] (current) – [AI Feature Tokenizer Configuration] Prinz, Patrick
Line 526: Line 526:
 </cg> </cg>
 </code> </code>
 +
 +**When using models newer than GPT-4o (e.g. GPT-5)**
 +
 +>The temperature has to be set to the default value of 1. Add <cg-property name="temperature" value="1" /> to each relevant <cg-host> element.
 +
 +>The <chatbot-property name="maxTokens" value="500"></chatbot-property> has to be replaced with <chatbot-property name="maxCompletionTokens" value="500"></chatbot-property>
  
 **AI content generation feature (OpenAI compatible):** **AI content generation feature (OpenAI compatible):**
Line 683: Line 689:
   <cg-property name="proxy" value="<proxy_ident>" />   <cg-property name="proxy" value="<proxy_ident>" />
 </cg-host> </cg-host>
 +</code>
 +
 +===== AI Feature Tokenizer Configuration =====
 +
 +LLMs that do NOT use the standard byte-pair-encoding algorithm, or that are provided via the openAICompatible adapter, CANNOT be used with the Azure tokenizer. To support custom tokenizers, Stages implements the HuggingFaceTokenizer from the Deep Java Library (DJL).
 +
 +Add the property “tokenizerFolder” to the cg-host section in config.xml:
 +
 +   <cg-host ident="custom-model-ident" url="${ai.model.url}" displayName="custom-model">
 +    <cg-property name=".... />
 +    <cg-property name="tokenizerFolder" value="custom-model-v1"/>
 +  </cg-host>
 +
 +In the directory methodpark\stages\conf:
 +
 +Create a folder tokenizer\custom-model-v1 ← make sure the folder name exactly matches the value of the tokenizerFolder property.
 +
 +Add the files tokenizer.json and tokenizer_config.json to this folder.
 +
 +**Example for using DeepSeek V3.2**
 +
 +1. cg-host configuration in config.xml
 +
 +<code ->
 +<cg-type name="openAICompatible">
 + <cg-host ident="training-model" url="${ai.model.url}" displayName="DeepSeek V3.2 for Chat">
 +  <cg-property name="deployment_name" value="DeepSeek_V3_2"/>
 +  <cg-property name="tokenizerFolder" value="DeepSeek_V3_2"/>
 +  ...
 +</cg-host>
 +</cg-type>
 +</code>
 +
 +2. chatbot configuration in config.xml
 +
 +>You need to change all chatbot steps that should use the configured LLM.
 +
 +<code ->
 +default:
 +<chatbot-step ident="summaries" type="azureOpenAI" host="gpt-4o-mini">
 +
 +custom configuration:
 +<chatbot-step ident="summaries" type="openAICompatible" host="training-model">
 +</code>
 +
 +3. Add tokenizer configuration:
 +
 +Because custom models are not processed by the standard Azure tokenizer, a custom tokenizer configuration has to be provided. Put it into a folder below conf/tokenizer whose name matches the tokenizerFolder value in config.xml (e.g. conf/tokenizer/DeepSeek_V3_2). The custom tokenizer can use the files tokenizer.json and tokenizer_config.json.
 +
 +For many models, these tokenizer configuration files can be downloaded from huggingface.co, e.g.:
 +
 +  * [[https://huggingface.co/deepseek-ai/DeepSeek-V3.2/blob/main/tokenizer.json]]
 +  * [[https://huggingface.co/deepseek-ai/DeepSeek-V3.2/blob/main/tokenizer_config.json]]
 +
 +To make the tokenizer_config.json work with Stages, a few small adjustments have to be made:
 +
 +>In tokenizer_config.json, look up the token entries that are objects (e.g. bos_token) and replace each whole object with just the value of its “content” attribute. See the following file examples:
 +
 +**Original tokenizer_config.json:**
 +
 +<code ->
 +{
 +  "add_bos_token": true,
 +  "add_eos_token": false,
 +  "bos_token": {
 +    "__type": "AddedToken",
 +    "content": "<|begin▁of▁sentence|>",
 +    "lstrip": false,
 +    "normalized": true,
 +    "rstrip": false,
 +    "single_word": false
 +  },
 +  "clean_up_tokenization_spaces": false,
 +  "eos_token": {
 +    "__type": "AddedToken",
 +    "content": "<|end▁of▁sentence|>",
 +    "lstrip": false,
 +    "normalized": true,
 +    "rstrip": false,
 +    "single_word": false
 +  },
 +  "legacy": true,
 +  "model_max_length": 16384,
 +  "pad_token": {
 +    "__type": "AddedToken",
 +    "content": "<|end▁of▁sentence|>",
 +    "lstrip": false,
 +    "normalized": true,
 +    "rstrip": false,
 +    "single_word": false
 +  },
 +  "sp_model_kwargs": {},
 +  "unk_token": null,
 +  "tokenizer_class": "LlamaTokenizerFast",
 +  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content 
= content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}"
 +}
 +</code>
 +
 +**Updated tokenizer_config.json:**
 +
 +<code ->
 +{
 +  "add_bos_token": true,
 +  "add_eos_token": false,
 +  "bos_token": "<|begin▁of▁sentence|>",
 +  "clean_up_tokenization_spaces": false,
 +  "eos_token": "<|end▁of▁sentence|>",
 +  "legacy": true,
 +  "model_max_length": 16384,
 +  "pad_token": "<|end▁of▁sentence|>",
 +  "unk_token": null,
 +  "tokenizer_class": "LlamaTokenizerFast",
 +  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content 
= content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}"
 +}
 </code> </code>