ailia_tokenizer package

Classes

class ailia_tokenizer.AiliaTokenizerResult(input_ids, attention_mask, sequence_ids, word_ids, char_starts, char_ends)

Bases: object

char_to_token(batch_or_char_index: int, char_index=None)
items()
keys()
sequence_ids(batch)
token_to_word(batch_or_token_index, token_index=None)
word_ids(batch_index=0)
word_to_chars(batch_or_word_index, word_index=None, sequence_index=None)
class ailia_tokenizer.PreTrainedTokenizer

Bases: object

add_special_tokens(special_tokens_dict)
batch_decode(sequences: List[List[int]], skip_special_tokens=False) -> List[str]
batch_encode_plus(text: str | List[str] | List[List[str]], text_pair=None, padding=True, truncation=True, return_tensors=None, max_length=None, split_special_tokens=False, return_token_type_ids=None, add_special_tokens=True)
convert_ids_to_tokens(ids: int | List[int]) -> str | List[str]
convert_tokens_to_ids(tokens: str | List[str]) -> int | List[int]
decode(input_ids: List[int], skip_special_tokens=False) -> str
encode(text: str, text_pair=None, padding=True, truncation=True, return_tensors=None, max_length=None, split_special_tokens=False, return_token_type_ids=None, add_special_tokens=True)
encode_plus(text: str, text_pair=None, padding=True, truncation=True, return_tensors=None, max_length=None, split_special_tokens=False, return_token_type_ids=None, add_special_tokens=True)
tokenize(text: str) -> List[str]
class ailia_tokenizer.WhisperTokenizer

Bases: PreTrainedTokenizer

classmethod from_pretrained(pretrained_model_name_or_path=None)
class ailia_tokenizer.CLIPTokenizer

Bases: PreTrainedTokenizer

classmethod from_pretrained(pretrained_model_name_or_path=None)
class ailia_tokenizer.XLMRobertaTokenizer

Bases: PreTrainedTokenizer

classmethod from_pretrained(pretrained_model_name_or_path)
class ailia_tokenizer.MarianTokenizer

Bases: PreTrainedTokenizer

classmethod from_pretrained(pretrained_model_name_or_path)
class ailia_tokenizer.BertJapaneseWordPieceTokenizer

Bases: PreTrainedTokenizer

classmethod from_pretrained(pretrained_model_name_or_path, dict_path)
class ailia_tokenizer.BertJapaneseCharacterTokenizer

Bases: PreTrainedTokenizer

classmethod from_pretrained(pretrained_model_name_or_path, dict_path)
class ailia_tokenizer.T5Tokenizer

Bases: PreTrainedTokenizer

classmethod from_pretrained(pretrained_model_name_or_path)
class ailia_tokenizer.RobertaTokenizer

Bases: PreTrainedTokenizer

classmethod from_pretrained(pretrained_model_name_or_path)
class ailia_tokenizer.BertTokenizer

Bases: PreTrainedTokenizer

classmethod from_pretrained(pretrained_model_name_or_path)
class ailia_tokenizer.GPT2Tokenizer

Bases: PreTrainedTokenizer

classmethod from_pretrained(pretrained_model_name_or_path)
class ailia_tokenizer.LlamaTokenizer

Bases: PreTrainedTokenizer

classmethod from_pretrained(pretrained_model_name_or_path)