# Source code for oumi.core.tokenizers.special_tokens
# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
from transformers import SpecialTokensMixin
from oumi.core.tokenizers import BaseTokenizer
from oumi.utils.logging import logger
# Llama 3.1/3.2 models already have `<|finetune_right_pad_id|>` token in their vocab.
# Llama 3.1/3.2 models already have `<|finetune_right_pad_id|>` token in their vocab.
LLAMA_SPECIAL_TOKENS_MIXIN = SpecialTokensMixin(pad_token="<|finetune_right_pad_id|>")

# Model repos known to share the Llama pad-token configuration. Both the
# `Llama-3.1-*` and the legacy `Meta-Llama-3.1-*` repo names are included.
_LLAMA_MODEL_NAMES = (
    "meta-llama/Llama-3.1-8B",
    "meta-llama/Llama-3.1-8B-Instruct",
    "meta-llama/Llama-3.1-70B",
    "meta-llama/Llama-3.1-70B-Instruct",
    "meta-llama/Llama-3.1-405B",
    "meta-llama/Llama-3.1-405B-Instruct",
    "meta-llama/Llama-3.1-405B-FP8",
    "meta-llama/Llama-3.1-405B-Instruct-FP8",
    "meta-llama/Meta-Llama-3.1-8B",
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "meta-llama/Meta-Llama-3.1-70B",
    "meta-llama/Meta-Llama-3.1-70B-Instruct",
    "meta-llama/Meta-Llama-3.1-405B",
    "meta-llama/Meta-Llama-3.1-405B-Instruct",
    "meta-llama/Meta-Llama-3.1-405B-FP8",
    "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
    "meta-llama/Llama-3.2-1B",
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B",
    "meta-llama/Llama-3.2-3B-Instruct",
)

# Maps model name -> special tokens mixin. Keys are lowercased up front so
# lookups can be case-insensitive.
special_tokens = {name.lower(): LLAMA_SPECIAL_TOKENS_MIXIN for name in _LLAMA_MODEL_NAMES}
def get_default_special_tokens(
    tokenizer: Optional[BaseTokenizer],
) -> SpecialTokensMixin:
    """Returns the default special tokens for the tokenizer that was provided.

    Args:
        tokenizer: The tokenizer to get special tokens for.

    Returns:
        The special tokens mixin for the tokenizer.

    Description:
        Looks up the special tokens registered for the provided tokenizer's
        `name_or_path` (case-insensitively) in the table of known models. If the
        tokenizer is ``None``, has no name, or is not recognized, an empty
        special tokens mixin is returned instead (with a warning logged for the
        unrecognized case). This serves as a fallback when a special token is
        required but absent from the tokenizer's configuration — most commonly
        the padding token (`pad_token`), which often exists in the vocabulary
        without being declared in the config.
    """
    name = tokenizer.name_or_path if tokenizer is not None else None
    if name:
        mixin = special_tokens.get(name.lower())
        if mixin is not None:
            return mixin
        # Known-model lookup missed; fall through to the empty mixin below.
        logger.warning(f"Special tokens lookup for tokenizer {name} failed.")
    return SpecialTokensMixin()