ailia_tokenizer  1.3.0.0
Macros | Functions
ailia_tokenizer.h File Reference
#include "ailia.h"
Include dependency graph for ailia_tokenizer.h:

Go to the source code of this file.

Macros

#define AILIA_API   __stdcall
 
#define AILIA_TOKENIZER_TYPE_WHISPER   (0)
 Tokenizer for Whisper. More...
 
#define AILIA_TOKENIZER_TYPE_CLIP   (1)
 Tokenizer for Clip. More...
 
#define AILIA_TOKENIZER_TYPE_XLM_ROBERTA   (2)
 Tokenizer for XLM_ROBERTA. More...
 
#define AILIA_TOKENIZER_TYPE_MARIAN   (3)
 Tokenizer for MARIAN. More...
 
#define AILIA_TOKENIZER_TYPE_BERT_JAPANESE_WORDPIECE   (4)
 Tokenizer for Japanese BERT. More...
 
#define AILIA_TOKENIZER_TYPE_BERT_JAPANESE_CHARACTER   (5)
 Tokenizer for Japanese BERT. More...
 
#define AILIA_TOKENIZER_TYPE_T5   (6)
 Tokenizer for T5. More...
 
#define AILIA_TOKENIZER_TYPE_ROBERTA   (7)
 Tokenizer for RoBERTa. More...
 
#define AILIA_TOKENIZER_TYPE_BERT   (8)
 Tokenizer for BERT. More...
 
#define AILIA_TOKENIZER_TYPE_GPT2   (9)
 Tokenizer for GPT2. More...
 
#define AILIA_TOKENIZER_TYPE_LLAMA   (10)
 Tokenizer for LLAMA. More...
 
#define AILIA_TOKENIZER_FLAG_NONE   (0)
 Default flag. More...
 
#define AILIA_TOKENIZER_FLAG_UTF8_SAFE   (1)
 Output only characters valid as UTF8. More...
 
#define ailiaTokenizerOpenModelFile   ailiaTokenizerOpenModelFileA
 
#define ailiaTokenizerOpenDictionaryFile   ailiaTokenizerOpenDictionaryFileA
 
#define ailiaTokenizerOpenVocabFile   ailiaTokenizerOpenVocabFileA
 
#define ailiaTokenizerOpenMergeFile   ailiaTokenizerOpenMergeFileA
 
#define ailiaTokenizerOpenAddedTokensFile   ailiaTokenizerOpenAddedTokensFileA
 
#define ailiaTokenizerOpenTokenizerConfigFile   ailiaTokenizerOpenTokenizerConfigFileA
 

Functions

int AILIA_API ailiaTokenizerCreate (struct AILIATokenizer **net, int type, int flags)
 Creates a tokenizer instance. More...
 
int AILIA_API ailiaTokenizerOpenModelFileA (struct AILIATokenizer *net, const char *path)
 Open model file. More...
 
int AILIA_API ailiaTokenizerOpenModelFileW (struct AILIATokenizer *net, const wchar_t *path)
 
int AILIA_API ailiaTokenizerOpenDictionaryFileA (struct AILIATokenizer *net, const char *path)
 Open dictionary file. More...
 
int AILIA_API ailiaTokenizerOpenDictionaryFileW (struct AILIATokenizer *net, const wchar_t *path)
 
int AILIA_API ailiaTokenizerOpenVocabFileA (struct AILIATokenizer *net, const char *path)
 Open vocab file. More...
 
int AILIA_API ailiaTokenizerOpenVocabFileW (struct AILIATokenizer *net, const wchar_t *path)
 
int AILIA_API ailiaTokenizerOpenMergeFileA (struct AILIATokenizer *net, const char *path)
 Open merges file. More...
 
int AILIA_API ailiaTokenizerOpenMergeFileW (struct AILIATokenizer *net, const wchar_t *path)
 
int AILIA_API ailiaTokenizerOpenAddedTokensFileA (struct AILIATokenizer *net, const char *path)
 Open added tokens file. More...
 
int AILIA_API ailiaTokenizerOpenAddedTokensFileW (struct AILIATokenizer *net, const wchar_t *path)
 
int AILIA_API ailiaTokenizerOpenTokenizerConfigFileA (struct AILIATokenizer *net, const char *path)
 Open tokenizer config file. More...
 
int AILIA_API ailiaTokenizerOpenTokenizerConfigFileW (struct AILIATokenizer *net, const wchar_t *path)
 
int AILIA_API ailiaTokenizerEncode (struct AILIATokenizer *net, const char *utf8)
 Perform encode. More...
 
int AILIA_API ailiaTokenizerEncodeWithSpecialTokens (struct AILIATokenizer *net, const char *utf8)
 Perform encode with special tokens. More...
 
int AILIA_API ailiaTokenizerGetTokenCount (struct AILIATokenizer *net, unsigned int *count)
 Gets the number of tokens. More...
 
int AILIA_API ailiaTokenizerGetTokens (struct AILIATokenizer *net, int *tokens, unsigned int count)
 Gets the tokens. More...
 
int AILIA_API ailiaTokenizerGetWordIDs (struct AILIATokenizer *net, int *word_ids, unsigned int count)
 Gets the word ID. More...
 
int AILIA_API ailiaTokenizerGetCharStarts (struct AILIATokenizer *net, int *char_starts, unsigned int count)
 Gets the Char start positions. More...
 
int AILIA_API ailiaTokenizerGetCharEnds (struct AILIATokenizer *net, int *char_ends, unsigned int count)
 Gets the Char end positions. More...
 
int AILIA_API ailiaTokenizerDecode (struct AILIATokenizer *net, const int *tokens, unsigned int token_count)
 Perform decode. More...
 
int AILIA_API ailiaTokenizerDecodeWithSpecialTokens (struct AILIATokenizer *net, const int *tokens, unsigned int token_count)
 Perform decode with special tokens. More...
 
int AILIA_API ailiaTokenizerGetTextLength (struct AILIATokenizer *net, unsigned int *len)
 Gets the size of text. (Include null) More...
 
int AILIA_API ailiaTokenizerGetText (struct AILIATokenizer *net, char *text, unsigned int len)
 Gets the decoded text. More...
 
int AILIA_API ailiaTokenizerGetVocabSize (struct AILIATokenizer *net, unsigned int *size)
 Gets the size of vocab. (Include null) More...
 
int AILIA_API ailiaTokenizerGetVocab (struct AILIATokenizer *net, int token, const char **vocab)
 Gets the vocab text of a token. More...
 
void AILIA_API ailiaTokenizerDestroy (struct AILIATokenizer *net)
 It destroys the tokenizer instance. More...
 
int AILIA_API ailiaTokenizerUtf8ToUtf32 (unsigned int *utf32, unsigned int *processed_byte, const char *utf8, unsigned int utf8_len)
 Convert UTF8 character to UTF32 character. More...
 
int AILIA_API ailiaTokenizerUtf32ToUtf8 (char *utf8, unsigned int *processed_byte, unsigned int utf32)
 Convert UTF32 character to UTF8 character. More...
 

Macro Definition Documentation

◆ AILIA_API

#define AILIA_API   __stdcall

◆ AILIA_TOKENIZER_FLAG_NONE

#define AILIA_TOKENIZER_FLAG_NONE   (0)

Default flag.

◆ AILIA_TOKENIZER_FLAG_UTF8_SAFE

#define AILIA_TOKENIZER_FLAG_UTF8_SAFE   (1)

Output only characters valid as UTF8.

◆ AILIA_TOKENIZER_TYPE_BERT

#define AILIA_TOKENIZER_TYPE_BERT   (8)

Tokenizer for BERT.

◆ AILIA_TOKENIZER_TYPE_BERT_JAPANESE_CHARACTER

#define AILIA_TOKENIZER_TYPE_BERT_JAPANESE_CHARACTER   (5)

Tokenizer for Japanese BERT.

The input text is internally normalized using Unicode NFKC normalization.

◆ AILIA_TOKENIZER_TYPE_BERT_JAPANESE_WORDPIECE

#define AILIA_TOKENIZER_TYPE_BERT_JAPANESE_WORDPIECE   (4)

Tokenizer for Japanese BERT.

The input text is internally normalized using Unicode NFKC normalization.

◆ AILIA_TOKENIZER_TYPE_CLIP

#define AILIA_TOKENIZER_TYPE_CLIP   (1)

Tokenizer for Clip.

◆ AILIA_TOKENIZER_TYPE_GPT2

#define AILIA_TOKENIZER_TYPE_GPT2   (9)

Tokenizer for GPT2.

◆ AILIA_TOKENIZER_TYPE_LLAMA

#define AILIA_TOKENIZER_TYPE_LLAMA   (10)

Tokenizer for LLAMA.

◆ AILIA_TOKENIZER_TYPE_MARIAN

#define AILIA_TOKENIZER_TYPE_MARIAN   (3)

Tokenizer for MARIAN.

◆ AILIA_TOKENIZER_TYPE_ROBERTA

#define AILIA_TOKENIZER_TYPE_ROBERTA   (7)

Tokenizer for RoBERTa.

◆ AILIA_TOKENIZER_TYPE_T5

#define AILIA_TOKENIZER_TYPE_T5   (6)

Tokenizer for T5.

Inserts the EOS symbol at the end, similar to add_special_tokens=True. If you want the same behavior as add_special_tokens=False, such as japanese_clip, remove the EOS symbol at the end of the output.

◆ AILIA_TOKENIZER_TYPE_WHISPER

#define AILIA_TOKENIZER_TYPE_WHISPER   (0)

Tokenizer for Whisper.

◆ AILIA_TOKENIZER_TYPE_XLM_ROBERTA

#define AILIA_TOKENIZER_TYPE_XLM_ROBERTA   (2)

Tokenizer for XLM_ROBERTA.

◆ ailiaTokenizerOpenAddedTokensFile

#define ailiaTokenizerOpenAddedTokensFile   ailiaTokenizerOpenAddedTokensFileA

◆ ailiaTokenizerOpenDictionaryFile

#define ailiaTokenizerOpenDictionaryFile   ailiaTokenizerOpenDictionaryFileA

◆ ailiaTokenizerOpenMergeFile

#define ailiaTokenizerOpenMergeFile   ailiaTokenizerOpenMergeFileA

◆ ailiaTokenizerOpenModelFile

#define ailiaTokenizerOpenModelFile   ailiaTokenizerOpenModelFileA

◆ ailiaTokenizerOpenTokenizerConfigFile

#define ailiaTokenizerOpenTokenizerConfigFile   ailiaTokenizerOpenTokenizerConfigFileA

◆ ailiaTokenizerOpenVocabFile

#define ailiaTokenizerOpenVocabFile   ailiaTokenizerOpenVocabFileA

Function Documentation

◆ ailiaTokenizerCreate()

int AILIA_API ailiaTokenizerCreate ( struct AILIATokenizer **  net,
int  type,
int  flags 
)

Creates a tokenizer instance.

Parameters
net  A pointer to the tokenizer instance pointer
type  AILIA_TOKENIZER_TYPE_*
flags  Bitwise OR of AILIA_TOKENIZER_FLAG_*
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Creates a tokenizer instance.

◆ ailiaTokenizerDecode()

int AILIA_API ailiaTokenizerDecode ( struct AILIATokenizer *  net,
const int *  tokens,
unsigned int  token_count 
)

Perform decode.

Parameters
netA tokenizer instance pointer
tokensTokens for decode
token_countThe number of tokens
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Get the decoded result with ailiaTokenizerGetText API. Similarly to skip_special_tokens=True, special tokens will not be output.

◆ ailiaTokenizerDecodeWithSpecialTokens()

int AILIA_API ailiaTokenizerDecodeWithSpecialTokens ( struct AILIATokenizer *  net,
const int *  tokens,
unsigned int  token_count 
)

Perform decode with special tokens.

Parameters
netA tokenizer instance pointer
tokensTokens for decode
token_countThe number of tokens
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Get the decoded result with ailiaTokenizerGetText API. Similarly to skip_special_tokens=False, special tokens will be output.

◆ ailiaTokenizerDestroy()

void AILIA_API ailiaTokenizerDestroy ( struct AILIATokenizer *  net)

It destroys the tokenizer instance.

Parameters
netA tokenizer instance pointer

◆ ailiaTokenizerEncode()

int AILIA_API ailiaTokenizerEncode ( struct AILIATokenizer *  net,
const char *  utf8 
)

Perform encode.

Parameters
net  A tokenizer instance pointer
utf8  Text to encode (UTF8)
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Get the encoded result with the ailiaTokenizerGetTokens API. As with split_special_tokens=True, special tokens in the input are not kept as single tokens; they are split and encoded as ordinary strings.

◆ ailiaTokenizerEncodeWithSpecialTokens()

int AILIA_API ailiaTokenizerEncodeWithSpecialTokens ( struct AILIATokenizer *  net,
const char *  utf8 
)

Perform encode with special tokens.

Parameters
net  A tokenizer instance pointer
utf8  Text to encode (UTF8)
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Get the encoded result with the ailiaTokenizerGetTokens API. As with split_special_tokens=False, special tokens in the input are not split and are output as special tokens.

◆ ailiaTokenizerGetCharEnds()

int AILIA_API ailiaTokenizerGetCharEnds ( struct AILIATokenizer *  net,
int *  char_ends,
unsigned int  count 
)

Gets the Char end positions.

Parameters
netA tokenizer instance pointer
char_endsChar end position
countToken count
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

If ailiaTokenizerEncode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE . This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT. The character end positions in UTF-32 units corresponding to each token are returned.

◆ ailiaTokenizerGetCharStarts()

int AILIA_API ailiaTokenizerGetCharStarts ( struct AILIATokenizer *  net,
int *  char_starts,
unsigned int  count 
)

Gets the Char start positions.

Parameters
netA tokenizer instance pointer
char_startsCharacter start position
countToken count
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

If ailiaTokenizerEncode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE . This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT. The character start positions in UTF-32 units corresponding to each token are returned.

◆ ailiaTokenizerGetText()

int AILIA_API ailiaTokenizerGetText ( struct AILIATokenizer *  net,
char *  text,
unsigned int  len 
)

Gets the decoded text.

Parameters
netA tokenizer instance pointer
textText(UTF8)
lenBuffer size
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

If ailiaTokenizerDecode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE .

◆ ailiaTokenizerGetTextLength()

int AILIA_API ailiaTokenizerGetTextLength ( struct AILIATokenizer *  net,
unsigned int *  len 
)

Gets the size of text. (Include null)

Parameters
netA tokenizer instance pointer
lenThe length of text
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

◆ ailiaTokenizerGetTokenCount()

int AILIA_API ailiaTokenizerGetTokenCount ( struct AILIATokenizer *  net,
unsigned int *  count 
)

Gets the number of tokens.

Parameters
net  A tokenizer instance pointer
count  The number of tokens
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

◆ ailiaTokenizerGetTokens()

int AILIA_API ailiaTokenizerGetTokens ( struct AILIATokenizer *  net,
int *  tokens,
unsigned int  count 
)

Gets the tokens.

Parameters
netA tokenizer instance pointer
tokensToken
countToken count
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

If ailiaTokenizerEncode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE .

◆ ailiaTokenizerGetVocab()

int AILIA_API ailiaTokenizerGetVocab ( struct AILIATokenizer *  net,
int  token,
const char **  vocab 
)

Gets the vocab text of a token.

Parameters
net  A tokenizer instance pointer
token  Token
vocab  Text of vocab (UTF8)
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

There is no need to release the vocab. The vocab pointer remains valid only until the next time an ailiaTokenizer API is called.

◆ ailiaTokenizerGetVocabSize()

int AILIA_API ailiaTokenizerGetVocabSize ( struct AILIATokenizer *  net,
unsigned int *  size 
)

Gets the size of vocab. (Include null)

Parameters
netA tokenizer instance pointer
sizeThe size of vocab
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

◆ ailiaTokenizerGetWordIDs()

int AILIA_API ailiaTokenizerGetWordIDs ( struct AILIATokenizer *  net,
int *  word_ids,
unsigned int  count 
)

Gets the word ID.

Parameters
netA tokenizer instance pointer
word_idsWord ID
countToken count
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

If ailiaTokenizerEncode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE . This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT.

◆ ailiaTokenizerOpenAddedTokensFileA()

int AILIA_API ailiaTokenizerOpenAddedTokensFileA ( struct AILIATokenizer *  net,
const char *  path 
)

Open added tokens file.

Parameters
net  A tokenizer instance pointer
path  Path to the added tokens file
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Open an added tokens file (json). This API is only required for AILIA_TOKENIZER_TYPE_WHISPER.

◆ ailiaTokenizerOpenAddedTokensFileW()

int AILIA_API ailiaTokenizerOpenAddedTokensFileW ( struct AILIATokenizer *  net,
const wchar_t *  path 
)

◆ ailiaTokenizerOpenDictionaryFileA()

int AILIA_API ailiaTokenizerOpenDictionaryFileA ( struct AILIATokenizer *  net,
const char *  path 
)

Open dictionary file.

Parameters
net  A tokenizer instance pointer
path  Path to the MeCab dictionary
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Open a dictionary file for MeCab. This API is only required for AILIA_TOKENIZER_TYPE_BERT_JAPANESE_WORDPIECE or AILIA_TOKENIZER_TYPE_BERT_JAPANESE_CHARACTER.

◆ ailiaTokenizerOpenDictionaryFileW()

int AILIA_API ailiaTokenizerOpenDictionaryFileW ( struct AILIATokenizer *  net,
const wchar_t *  path 
)

◆ ailiaTokenizerOpenMergeFileA()

int AILIA_API ailiaTokenizerOpenMergeFileA ( struct AILIATokenizer *  net,
const char *  path 
)

Open merges file.

Parameters
net  A tokenizer instance pointer
path  Path to the merges file
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Open a merges file (txt). This API is only required for AILIA_TOKENIZER_TYPE_ROBERTA, AILIA_TOKENIZER_TYPE_WHISPER, or AILIA_TOKENIZER_TYPE_GPT2.

◆ ailiaTokenizerOpenMergeFileW()

int AILIA_API ailiaTokenizerOpenMergeFileW ( struct AILIATokenizer *  net,
const wchar_t *  path 
)

◆ ailiaTokenizerOpenModelFileA()

int AILIA_API ailiaTokenizerOpenModelFileA ( struct AILIATokenizer *  net,
const char *  path 
)

Open model file.

Parameters
net  A tokenizer instance pointer
path  Path to the SentencePiece model file
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Open a model file for SentencePiece. This API is only required for AILIA_TOKENIZER_TYPE_XLM_ROBERTA or AILIA_TOKENIZER_TYPE_MARIAN.

◆ ailiaTokenizerOpenModelFileW()

int AILIA_API ailiaTokenizerOpenModelFileW ( struct AILIATokenizer *  net,
const wchar_t *  path 
)

◆ ailiaTokenizerOpenTokenizerConfigFileA()

int AILIA_API ailiaTokenizerOpenTokenizerConfigFileA ( struct AILIATokenizer *  net,
const char *  path 
)

Open tokenizer config file.

Parameters
net  A tokenizer instance pointer
path  Path to the tokenizer config file
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Open a tokenizer config file (json). This API is only required for AILIA_TOKENIZER_TYPE_BERT.

◆ ailiaTokenizerOpenTokenizerConfigFileW()

int AILIA_API ailiaTokenizerOpenTokenizerConfigFileW ( struct AILIATokenizer *  net,
const wchar_t *  path 
)

◆ ailiaTokenizerOpenVocabFileA()

int AILIA_API ailiaTokenizerOpenVocabFileA ( struct AILIATokenizer *  net,
const char *  path 
)

Open vocab file.

Parameters
net  A tokenizer instance pointer
path  Path to the vocab file
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Open a vocab file (json for ROBERTA or WHISPER or GPT2, txt for others).

◆ ailiaTokenizerOpenVocabFileW()

int AILIA_API ailiaTokenizerOpenVocabFileW ( struct AILIATokenizer *  net,
const wchar_t *  path 
)

◆ ailiaTokenizerUtf32ToUtf8()

int AILIA_API ailiaTokenizerUtf32ToUtf8 ( char *  utf8,
unsigned int *  processed_byte,
unsigned int  utf32 
)

Convert UTF32 character to UTF8 character.

Parameters
utf8  Output buffer for the UTF8 character (requires a buffer greater than 4 bytes)
processed_byte  Number of UTF8 bytes processed
utf32  UTF32 character
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

◆ ailiaTokenizerUtf8ToUtf32()

int AILIA_API ailiaTokenizerUtf8ToUtf32 ( unsigned int *  utf32,
unsigned int *  processed_byte,
const char *  utf8,
unsigned int  utf8_len 
)

Convert UTF8 character to UTF32 character.

Parameters
utf32  UTF32 character
processed_byte  Number of UTF8 bytes processed
utf8  UTF8 character
utf8_len  Buffer size
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.