| Both sides previous revisionPrevious revisionNext revision | Previous revision |
| 712:configure_stages [2026/02/26 09:48] – Prinz, Patrick | 712:configure_stages [2026/03/25 09:44] (current) – [AI Feature Tokenizer Configuration] Prinz, Patrick |
|---|
| </cg> | </cg> |
| </code> | </code> |
| | |
| | **When using models newer than GPT-4o (like GPT-5)** |
| | |
| | >The temperature has to be set to the default value of 1. Add <cg-property name="temperature" value="1" /> to the relevant <cg-host> elements. |
| | |
| | >The <chatbot-property name="maxTokens" value="500"></chatbot-property> has to be replaced with <chatbot-property name="maxCompletionTokens" value="500"></chatbot-property> |
| |
| **AI content generation feature (OpenAI compatible):** | **AI content generation feature (OpenAI compatible):** |
| <cg-property name="proxy" value="<proxy_ident>" /> | <cg-property name="proxy" value="<proxy_ident>" /> |
| <cg-host/> | <cg-host/> |
| | </code> |
| | |
| | ===== AI Feature Tokenizer Configuration ===== |
| | |
| | LLMs that are NOT using the standard byte-pair-encoding algorithm, or that are provided via the openAICompatible adapter, CANNOT be used with the Azure tokenizer. Stages implements the HuggingFaceTokenizer from the Deep Java Library (DJL) to support customized tokenizers. |
| | |
| | Add the property "tokenizerFolder" to the <cg-host> section: |
| | |
| | <cg-host ident="custom-model-ident" url="${ai.model.url}" displayName="custom-model"> |
| | <cg-property name=".... /> |
| | <cg-property name="tokenizerFolder" value="custom-model-v1"/> |
| | </cg-host> |
| | |
| | In methodpark\stages\conf |
| | |
| | Create a folder tokenizer\custom-model-v1 ← make sure the folder name exactly matches the value in the tokenizerFolder property |
| | |
| | Add tokenizer.json and tokenizer_config.json |
| | |
| | **Example for using DeepSeek V3.2** |
| | |
| | 1. cg-host configuration in config.xml |
| | |
| | <code -> |
| | <cg-type name="openAICompatible"> |
| | <cg-host ident="training-model" url="${ai.model.url}" displayName="DeepSeek V3.2 for Chat"> |
| | <cg-property name="deployment_name" value="DeepSeek_V3_2"/> |
| | <cg-property name="tokenizerFolder" value="DeepSeek_V3_2"/> |
| | ... |
| | </cg-host> |
| | </cg-type> |
| | </code> |
| | |
| | 2. chatbot configuration in config.xml |
| | |
| | >You need to change all chatbot steps that should use the configured LLM. |
| | |
| | <code -> |
| | default: |
| | <chatbot-step ident="summaries" type="azureOpenAI" host="gpt-4o-mini"> |
| | |
| | custom configuration: |
| | <chatbot-step ident="summaries" type="openAICompatible" host="training-model"> |
| | </code> |
| | |
| | 3. Add tokenizer configuration: |
| | |
| | As the custom models are not processed by the standard azure tokenizer, there has to be a custom configuration added and put into a folder in conf/tokenizer that matches the config.xml (e.g. conf/tokenizer/DeepSeek_V3_2). The custom tokenizer can use tokenizer.json and tokenizer_config.json files. |
| | |
| | For many models, the required configuration files can be downloaded from huggingface.co, e.g.: |
| | |
| | * [[https://huggingface.co/deepseek-ai/DeepSeek-V3.2/blob/main/tokenizer.json]] |
| | * [[https://huggingface.co/deepseek-ai/DeepSeek-V3.2/blob/main/tokenizer_config.json]] |
| | |
| | To make the tokenizer_config.json work with Stages, a few small adjustments have to be made: |
| | |
| | >In tokenizer_config.json, look up entries whose value is an object (e.g. bos_token) and replace the whole object with just the value of its "content" attribute. See the following file examples: |
| | |
| | **Original tokenizer_config.json :** |
| | |
| | <code -> |
| | { |
| | "add_bos_token": true, |
| | "add_eos_token": false, |
| | "bos_token": { |
| | "__type": "AddedToken", |
| | "content": "<|begin▁of▁sentence|>", |
| | "lstrip": false, |
| | "normalized": true, |
| | "rstrip": false, |
| | "single_word": false |
| | }, |
| | "clean_up_tokenization_spaces": false, |
| | "eos_token": { |
| | "__type": "AddedToken", |
| | "content": "<|end▁of▁sentence|>", |
| | "lstrip": false, |
| | "normalized": true, |
| | "rstrip": false, |
| | "single_word": false |
| | }, |
| | "legacy": true, |
| | "model_max_length": 16384, |
| | "pad_token": { |
| | "__type": "AddedToken", |
| | "content": "<|end▁of▁sentence|>", |
| | "lstrip": false, |
| | "normalized": true, |
| | "rstrip": false, |
| | "single_word": false |
| | }, |
| | "sp_model_kwargs": {}, |
| | "unk_token": null, |
| | "tokenizer_class": "LlamaTokenizerFast", |
| | "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content 
= content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}" |
| | } |
| | </code> |
| | |
| | **Updated tokenizer_config.json** |
| | |
| | <code -> |
| | { |
| | "add_bos_token": true, |
| | "add_eos_token": false, |
| | "bos_token": "<|begin▁of▁sentence|>", |
| | "clean_up_tokenization_spaces": false, |
| | "eos_token": "<|end▁of▁sentence|>", |
| | "legacy": true, |
| | "model_max_length": 16384, |
| | "pad_token": "<|end▁of▁sentence|>", |
| | "unk_token": null, |
| | "tokenizer_class": "LlamaTokenizerFast", |
| | "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content 
= content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}" |
| | } |
| </code> | </code> |
| |
| |