ailia_tokenizer
1.4.0.0
|
#include "ailia.h"
Go to the source code of this file.
Functions | |
int AILIA_API | ailiaTokenizerCreate (struct AILIATokenizer **net, int type, int flags) |
Creates a tokenizer instance. More... | |
int AILIA_API | ailiaTokenizerOpenModelFileA (struct AILIATokenizer *net, const char *path) |
Open model file. More... | |
int AILIA_API | ailiaTokenizerOpenModelFileW (struct AILIATokenizer *net, const wchar_t *path) |
int AILIA_API | ailiaTokenizerOpenDictionaryFileA (struct AILIATokenizer *net, const char *path) |
Open dictionary file. More... | |
int AILIA_API | ailiaTokenizerOpenDictionaryFileW (struct AILIATokenizer *net, const wchar_t *path) |
int AILIA_API | ailiaTokenizerOpenVocabFileA (struct AILIATokenizer *net, const char *path) |
Open vocab file. More... | |
int AILIA_API | ailiaTokenizerOpenVocabFileW (struct AILIATokenizer *net, const wchar_t *path) |
int AILIA_API | ailiaTokenizerOpenMergeFileA (struct AILIATokenizer *net, const char *path) |
Open merges file. More... | |
int AILIA_API | ailiaTokenizerOpenMergeFileW (struct AILIATokenizer *net, const wchar_t *path) |
int AILIA_API | ailiaTokenizerOpenAddedTokensFileA (struct AILIATokenizer *net, const char *path) |
Open added tokens file. More... | |
int AILIA_API | ailiaTokenizerOpenAddedTokensFileW (struct AILIATokenizer *net, const wchar_t *path) |
int AILIA_API | ailiaTokenizerOpenTokenizerConfigFileA (struct AILIATokenizer *net, const char *path) |
Open tokenizer config file. More... | |
int AILIA_API | ailiaTokenizerOpenTokenizerConfigFileW (struct AILIATokenizer *net, const wchar_t *path) |
int AILIA_API | ailiaTokenizerEncode (struct AILIATokenizer *net, const char *utf8) |
Perform encode. More... | |
int AILIA_API | ailiaTokenizerEncodeWithSpecialTokens (struct AILIATokenizer *net, const char *utf8) |
Perform encode with special tokens. More... | |
int AILIA_API | ailiaTokenizerGetTokenCount (struct AILIATokenizer *net, unsigned int *count) |
Gets the number of tokens. More... | |
int AILIA_API | ailiaTokenizerGetTokens (struct AILIATokenizer *net, int *tokens, unsigned int count) |
Gets the tokens. More... | |
int AILIA_API | ailiaTokenizerGetWordIDs (struct AILIATokenizer *net, int *word_ids, unsigned int count) |
Gets the word ID. More... | |
int AILIA_API | ailiaTokenizerGetCharStarts (struct AILIATokenizer *net, int *char_starts, unsigned int count) |
Gets the Char start positions. More... | |
int AILIA_API | ailiaTokenizerGetCharEnds (struct AILIATokenizer *net, int *char_ends, unsigned int count) |
Gets the Char end positions. More... | |
int AILIA_API | ailiaTokenizerDecode (struct AILIATokenizer *net, const int *tokens, unsigned int token_count) |
Perform decode. More... | |
int AILIA_API | ailiaTokenizerDecodeWithSpecialTokens (struct AILIATokenizer *net, const int *tokens, unsigned int token_count) |
Perform decode with special tokens. More... | |
int AILIA_API | ailiaTokenizerGetTextLength (struct AILIATokenizer *net, unsigned int *len) |
Gets the size of the text, including the null terminator. More... | |
int AILIA_API | ailiaTokenizerGetText (struct AILIATokenizer *net, char *text, unsigned int len) |
Gets the decoded text. More... | |
int AILIA_API | ailiaTokenizerGetVocabSize (struct AILIATokenizer *net, unsigned int *size) |
Gets the size of vocab. (Include null) More... | |
int AILIA_API | ailiaTokenizerGetVocab (struct AILIATokenizer *net, int token, const char **vocab) |
Gets the vocab. More... | |
int AILIA_API | ailiaTokenizerAddSpecialTokens (struct AILIATokenizer *net, const char **tokens, unsigned int count) |
Add SpecialToken. More... | |
void AILIA_API | ailiaTokenizerDestroy (struct AILIATokenizer *net) |
Destroys the tokenizer instance. More... | |
int AILIA_API | ailiaTokenizerUtf8ToUtf32 (unsigned int *utf32, unsigned int *processed_byte, const char *utf8, unsigned int utf8_len) |
Convert UTF8 character to UTF32 character. More... | |
int AILIA_API | ailiaTokenizerUtf32ToUtf8 (char *utf8, unsigned int *processed_byte, unsigned int utf32) |
Convert UTF32 character to UTF8 character. More... | |
#define AILIA_API __stdcall |
#define AILIA_TOKENIZER_FLAG_NONE (0) |
Default flag.
#define AILIA_TOKENIZER_FLAG_UTF8_SAFE (1) |
Output only characters valid as UTF8.
#define AILIA_TOKENIZER_TYPE_BERT (8) |
Tokenizer for BERT.
#define AILIA_TOKENIZER_TYPE_BERT_JAPANESE_CHARACTER (5) |
Tokenizer for Japanese BERT.
The input text is internally normalized in Unicode format using NFKC format.
#define AILIA_TOKENIZER_TYPE_BERT_JAPANESE_WORDPIECE (4) |
Tokenizer for Japanese BERT.
The input text is internally normalized in Unicode format using NFKC format.
#define AILIA_TOKENIZER_TYPE_CLIP (1) |
Tokenizer for Clip.
#define AILIA_TOKENIZER_TYPE_GPT2 (9) |
Tokenizer for GPT2.
#define AILIA_TOKENIZER_TYPE_LLAMA (10) |
Tokenizer for LLAMA.
#define AILIA_TOKENIZER_TYPE_MARIAN (3) |
Tokenizer for MARIAN.
#define AILIA_TOKENIZER_TYPE_ROBERTA (7) |
Tokenizer for RoBERTa.
#define AILIA_TOKENIZER_TYPE_T5 (6) |
Tokenizer for T5.
Inserts the EOS symbol at the end, similar to add_special_tokens=True. If you want the same behavior as add_special_tokens=False, such as japanese_clip, remove the EOS symbol at the end of the output.
#define AILIA_TOKENIZER_TYPE_WHISPER (0) |
Tokenizer for Whisper.
#define AILIA_TOKENIZER_TYPE_XLM_ROBERTA (2) |
Tokenizer for XLM_ROBERTA.
#define ailiaTokenizerOpenAddedTokensFile ailiaTokenizerOpenAddedTokensFileA |
#define ailiaTokenizerOpenDictionaryFile ailiaTokenizerOpenDictionaryFileA |
#define ailiaTokenizerOpenMergeFile ailiaTokenizerOpenMergeFileA |
#define ailiaTokenizerOpenModelFile ailiaTokenizerOpenModelFileA |
#define ailiaTokenizerOpenTokenizerConfigFile ailiaTokenizerOpenTokenizerConfigFileA |
#define ailiaTokenizerOpenVocabFile ailiaTokenizerOpenVocabFileA |
int AILIA_API ailiaTokenizerAddSpecialTokens | ( | struct AILIATokenizer * | net, |
const char ** | tokens, | ||
unsigned int | count | ||
) |
Add SpecialToken.
net | A tokenizer instance pointer |
tokens | Token(UTF8) |
count | The number of tokens |
This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA.
int AILIA_API ailiaTokenizerCreate | ( | struct AILIATokenizer ** | net, |
int | type, | ||
int | flags | ||
) |
Creates a tokenizer instance.
net | A pointer to the tokenizer instance pointer |
type | AILIA_TOKENIZER_TYPE_* |
flags | OR of AILIA_TOKENIZER_FLAG_* |
Creates a tokenizer instance.
int AILIA_API ailiaTokenizerDecode | ( | struct AILIATokenizer * | net, |
const int * | tokens, | ||
unsigned int | token_count | ||
) |
Perform decode.
net | A tokenizer instance pointer |
tokens | Tokens for decode |
token_count | The number of tokens |
Get the decoded result with ailiaTokenizerGetText API. Similarly to skip_special_tokens=True, special tokens will not be output.
int AILIA_API ailiaTokenizerDecodeWithSpecialTokens | ( | struct AILIATokenizer * | net, |
const int * | tokens, | ||
unsigned int | token_count | ||
) |
Perform decode with special tokens.
net | A tokenizer instance pointer |
tokens | Tokens for decode |
token_count | The number of tokens |
Get the decoded result with ailiaTokenizerGetText API. Similarly to skip_special_tokens=False, special tokens will be output.
void AILIA_API ailiaTokenizerDestroy | ( | struct AILIATokenizer * | net | ) |
It destroys the tokenizer instance.
net | A tokenizer instance pointer |
int AILIA_API ailiaTokenizerEncode | ( | struct AILIATokenizer * | net, |
const char * | utf8 | ||
) |
Perform encode.
net | A tokenizer instance pointer |
utf8 | Text for encode (UTF8) |
Get the encoded result with the ailiaTokenizerGetTokens API. As with split_special_tokens=True, special tokens are split and treated as ordinary strings.
int AILIA_API ailiaTokenizerEncodeWithSpecialTokens | ( | struct AILIATokenizer * | net, |
const char * | utf8 | ||
) |
Perform encode with special tokens.
net | A tokenizer instance pointer |
utf8 | Text for encode (UTF8) |
Get the encoded result with the ailiaTokenizerGetTokens API. As with split_special_tokens=False, special tokens are not split and are output as special tokens.
int AILIA_API ailiaTokenizerGetCharEnds | ( | struct AILIATokenizer * | net, |
int * | char_ends, | ||
unsigned int | count | ||
) |
Gets the Char end positions.
net | A tokenizer instance pointer |
char_ends | Char end position |
count | Token count |
If ailiaTokenizerEncode() has not been run, the function returns AILIA_STATUS_INVALID_STATE. This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT. The character end positions in UTF-32 units corresponding to each token are returned.
int AILIA_API ailiaTokenizerGetCharStarts | ( | struct AILIATokenizer * | net, |
int * | char_starts, | ||
unsigned int | count | ||
) |
Gets the Char start positions.
net | A tokenizer instance pointer |
char_starts | Character start position |
count | Token count |
If ailiaTokenizerEncode() has not been run, the function returns AILIA_STATUS_INVALID_STATE. This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT. The character start positions in UTF-32 units corresponding to each token are returned.
int AILIA_API ailiaTokenizerGetText | ( | struct AILIATokenizer * | net, |
char * | text, | ||
unsigned int | len | ||
) |
Gets the decoded text.
net | A tokenizer instance pointer |
text | Text(UTF8) |
len | Buffer size |
If ailiaTokenizerDecode() has not been run, the function returns AILIA_STATUS_INVALID_STATE.
int AILIA_API ailiaTokenizerGetTextLength | ( | struct AILIATokenizer * | net, |
unsigned int * | len | ||
) |
Gets the size of the text, including the null terminator.
net | A tokenizer instance pointer |
len | The length of text |
int AILIA_API ailiaTokenizerGetTokenCount | ( | struct AILIATokenizer * | net, |
unsigned int * | count | ||
) |
Gets the number of tokens.
net | A tokenizer instance pointer |
count | The number of tokens |
int AILIA_API ailiaTokenizerGetTokens | ( | struct AILIATokenizer * | net, |
int * | tokens, | ||
unsigned int | count | ||
) |
Gets the tokens.
net | A tokenizer instance pointer |
tokens | Token |
count | Token count |
If ailiaTokenizerEncode() has not been run, the function returns AILIA_STATUS_INVALID_STATE.
int AILIA_API ailiaTokenizerGetVocab | ( | struct AILIATokenizer * | net, |
int | token, | ||
const char ** | vocab | ||
) |
Gets the vocab.
net | A tokenizer instance pointer |
token | Token |
vocab | Text of vocab (UTF8) |
There is no need to release the vocab. The returned vocab pointer remains valid until the next ailiaTokenizer API call.
int AILIA_API ailiaTokenizerGetVocabSize | ( | struct AILIATokenizer * | net, |
unsigned int * | size | ||
) |
Gets the size of vocab. (Include null)
net | A tokenizer instance pointer |
size | The size of vocab |
int AILIA_API ailiaTokenizerGetWordIDs | ( | struct AILIATokenizer * | net, |
int * | word_ids, | ||
unsigned int | count | ||
) |
Gets the word ID.
net | A tokenizer instance pointer |
word_ids | Word ID |
count | Token count |
If ailiaTokenizerEncode() has not been run, the function returns AILIA_STATUS_INVALID_STATE. This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT.
int AILIA_API ailiaTokenizerOpenAddedTokensFileA | ( | struct AILIATokenizer * | net, |
const char * | path | ||
) |
Open added tokens file.
net | A tokenizer instance pointer |
path | Path for added tokens file |
Opens an added tokens file (json). This API is required only for AILIA_TOKENIZER_TYPE_WHISPER.
int AILIA_API ailiaTokenizerOpenAddedTokensFileW | ( | struct AILIATokenizer * | net, |
const wchar_t * | path | ||
) |
int AILIA_API ailiaTokenizerOpenDictionaryFileA | ( | struct AILIATokenizer * | net, |
const char * | path | ||
) |
Open dictionary file.
net | A tokenizer instance pointer |
path | Path for dictionary of MeCab |
Opens a dictionary file for MeCab. This API is required only for AILIA_TOKENIZER_TYPE_BERT_JAPANESE_XXX.
int AILIA_API ailiaTokenizerOpenDictionaryFileW | ( | struct AILIATokenizer * | net, |
const wchar_t * | path | ||
) |
int AILIA_API ailiaTokenizerOpenMergeFileA | ( | struct AILIATokenizer * | net, |
const char * | path | ||
) |
Open merges file.
net | A tokenizer instance pointer |
path | Path for merges file |
Opens a merges file (txt). This API is required only for AILIA_TOKENIZER_TYPE_ROBERTA, AILIA_TOKENIZER_TYPE_WHISPER, or AILIA_TOKENIZER_TYPE_GPT2.
int AILIA_API ailiaTokenizerOpenMergeFileW | ( | struct AILIATokenizer * | net, |
const wchar_t * | path | ||
) |
int AILIA_API ailiaTokenizerOpenModelFileA | ( | struct AILIATokenizer * | net, |
const char * | path | ||
) |
Open model file.
net | A tokenizer instance pointer |
path | Path for SentencePiece model file |
Opens a model file for SentencePiece. This API is required only for AILIA_TOKENIZER_TYPE_XLM_ROBERTA or AILIA_TOKENIZER_TYPE_MARIAN.
int AILIA_API ailiaTokenizerOpenModelFileW | ( | struct AILIATokenizer * | net, |
const wchar_t * | path | ||
) |
int AILIA_API ailiaTokenizerOpenTokenizerConfigFileA | ( | struct AILIATokenizer * | net, |
const char * | path | ||
) |
Open tokenizer config file.
net | A tokenizer instance pointer |
path | Path for config file |
Opens a tokenizer config file (json). This API is required only for AILIA_TOKENIZER_TYPE_BERT.
int AILIA_API ailiaTokenizerOpenTokenizerConfigFileW | ( | struct AILIATokenizer * | net, |
const wchar_t * | path | ||
) |
int AILIA_API ailiaTokenizerOpenVocabFileA | ( | struct AILIATokenizer * | net, |
const char * | path | ||
) |
Open vocab file.
net | A tokenizer instance pointer |
path | Path for Vocab file |
Open a vocab file (json for ROBERTA or WHISPER or GPT2, txt for others).
int AILIA_API ailiaTokenizerOpenVocabFileW | ( | struct AILIATokenizer * | net, |
const wchar_t * | path | ||
) |
int AILIA_API ailiaTokenizerUtf32ToUtf8 | ( | char * | utf8, |
unsigned int * | processed_byte, | ||
unsigned int | utf32 | ||
) |
Convert UTF32 character to UTF8 character.
utf8 | UTF8 character (a buffer of more than 4 bytes is required) |
processed_byte | Processed bytes on UTF8 |
utf32 | UTF32 character |
int AILIA_API ailiaTokenizerUtf8ToUtf32 | ( | unsigned int * | utf32, |
unsigned int * | processed_byte, | ||
const char * | utf8, | ||
unsigned int | utf8_len | ||
) |
Convert UTF8 character to UTF32 character.
utf32 | UTF32 character |
processed_byte | Processed bytes on UTF8 |
utf8 | UTF8 character |
utf8_len | Buffer Size |