|
ailia_tokenizer
1.5.0.0
|
#include "ailia.h"
Go to the source code of this file.
Functions | |
| int AILIA_API | ailiaTokenizerCreate (struct AILIATokenizer **net, int type, int flags) |
| Creates a tokenizer instance. More... | |
| int AILIA_API | ailiaTokenizerOpenModelFileA (struct AILIATokenizer *net, const char *path) |
| Open model file. More... | |
| int AILIA_API | ailiaTokenizerOpenModelFileW (struct AILIATokenizer *net, const wchar_t *path) |
| int AILIA_API | ailiaTokenizerOpenDictionaryFileA (struct AILIATokenizer *net, const char *path) |
| Open dictionary file. More... | |
| int AILIA_API | ailiaTokenizerOpenDictionaryFileW (struct AILIATokenizer *net, const wchar_t *path) |
| int AILIA_API | ailiaTokenizerOpenVocabFileA (struct AILIATokenizer *net, const char *path) |
| Open vocab file. More... | |
| int AILIA_API | ailiaTokenizerOpenVocabFileW (struct AILIATokenizer *net, const wchar_t *path) |
| int AILIA_API | ailiaTokenizerOpenMergeFileA (struct AILIATokenizer *net, const char *path) |
| Open merges file. More... | |
| int AILIA_API | ailiaTokenizerOpenMergeFileW (struct AILIATokenizer *net, const wchar_t *path) |
| int AILIA_API | ailiaTokenizerOpenAddedTokensFileA (struct AILIATokenizer *net, const char *path) |
| Open added tokens file. More... | |
| int AILIA_API | ailiaTokenizerOpenAddedTokensFileW (struct AILIATokenizer *net, const wchar_t *path) |
| int AILIA_API | ailiaTokenizerOpenTokenizerConfigFileA (struct AILIATokenizer *net, const char *path) |
| Open tokenizer config file. More... | |
| int AILIA_API | ailiaTokenizerOpenTokenizerConfigFileW (struct AILIATokenizer *net, const wchar_t *path) |
| int AILIA_API | ailiaTokenizerEncode (struct AILIATokenizer *net, const char *utf8) |
| Perform encode. More... | |
| int AILIA_API | ailiaTokenizerEncodeWithSpecialTokens (struct AILIATokenizer *net, const char *utf8) |
| Perform encode with special tokens. More... | |
| int AILIA_API | ailiaTokenizerGetTokenCount (struct AILIATokenizer *net, unsigned int *count) |
| Gets the number of tokens. More... | |
| int AILIA_API | ailiaTokenizerGetTokens (struct AILIATokenizer *net, int *tokens, unsigned int count) |
| Gets the tokens. More... | |
| int AILIA_API | ailiaTokenizerGetWordIDs (struct AILIATokenizer *net, int *word_ids, unsigned int count) |
| Gets the word ID. More... | |
| int AILIA_API | ailiaTokenizerGetCharStarts (struct AILIATokenizer *net, int *char_starts, unsigned int count) |
| Gets the Char start positions. More... | |
| int AILIA_API | ailiaTokenizerGetCharEnds (struct AILIATokenizer *net, int *char_ends, unsigned int count) |
| Gets the Char end positions. More... | |
| int AILIA_API | ailiaTokenizerDecode (struct AILIATokenizer *net, const int *tokens, unsigned int token_count) |
| Perform decode. More... | |
| int AILIA_API | ailiaTokenizerDecodeWithSpecialTokens (struct AILIATokenizer *net, const int *tokens, unsigned int token_count) |
| Perform decode with special tokens. More... | |
| int AILIA_API | ailiaTokenizerGetTextLength (struct AILIATokenizer *net, unsigned int *len) |
| Gets the size of the text, including the terminating null. More... | |
| int AILIA_API | ailiaTokenizerGetText (struct AILIATokenizer *net, char *text, unsigned int len) |
| Gets the decoded text. More... | |
| int AILIA_API | ailiaTokenizerGetVocabSize (struct AILIATokenizer *net, unsigned int *size) |
| Gets the size of vocab. (Include null) More... | |
| int AILIA_API | ailiaTokenizerGetVocab (struct AILIATokenizer *net, int token, const char **vocab) |
| Gets the vocab. More... | |
| int AILIA_API | ailiaTokenizerAddSpecialTokens (struct AILIATokenizer *net, const char **tokens, unsigned int count) |
| Add SpecialToken. More... | |
| void AILIA_API | ailiaTokenizerDestroy (struct AILIATokenizer *net) |
| It destroys the tokenizer instance. More... | |
| int AILIA_API | ailiaTokenizerUtf8ToUtf32 (unsigned int *utf32, unsigned int *processed_byte, const char *utf8, unsigned int utf8_len) |
| Convert UTF8 character to UTF32 character. More... | |
| int AILIA_API | ailiaTokenizerUtf32ToUtf8 (char *utf8, unsigned int *processed_byte, unsigned int utf32) |
| Convert UTF32 character to UTF8 character. More... | |
| #define AILIA_API __stdcall |
| #define AILIA_TOKENIZER_FLAG_NONE (0) |
Default flag.
| #define AILIA_TOKENIZER_FLAG_UTF8_SAFE (1) |
Output only characters valid as UTF8.
| #define AILIA_TOKENIZER_TYPE_BERT (8) |
Tokenizer for BERT.
| #define AILIA_TOKENIZER_TYPE_BERT_JAPANESE_CHARACTER (5) |
Tokenizer for Japanese BERT.
The input text is internally normalized using Unicode NFKC normalization.
| #define AILIA_TOKENIZER_TYPE_BERT_JAPANESE_WORDPIECE (4) |
Tokenizer for Japanese BERT.
The input text is internally normalized using Unicode NFKC normalization.
| #define AILIA_TOKENIZER_TYPE_CLIP (1) |
Tokenizer for Clip.
| #define AILIA_TOKENIZER_TYPE_GPT2 (9) |
Tokenizer for GPT2.
| #define AILIA_TOKENIZER_TYPE_LLAMA (10) |
Tokenizer for LLAMA.
| #define AILIA_TOKENIZER_TYPE_MARIAN (3) |
Tokenizer for MARIAN.
| #define AILIA_TOKENIZER_TYPE_ROBERTA (7) |
Tokenizer for RoBERTa.
| #define AILIA_TOKENIZER_TYPE_T5 (6) |
Tokenizer for T5.
Inserts the EOS symbol at the end, similar to add_special_tokens=True. If you want the same behavior as add_special_tokens=False, such as japanese_clip, remove the EOS symbol at the end of the output.
| #define AILIA_TOKENIZER_TYPE_WHISPER (0) |
Tokenizer for Whisper.
| #define AILIA_TOKENIZER_TYPE_XLM_ROBERTA (2) |
Tokenizer for XLM_ROBERTA.
| #define ailiaTokenizerOpenAddedTokensFile ailiaTokenizerOpenAddedTokensFileA |
| #define ailiaTokenizerOpenDictionaryFile ailiaTokenizerOpenDictionaryFileA |
| #define ailiaTokenizerOpenMergeFile ailiaTokenizerOpenMergeFileA |
| #define ailiaTokenizerOpenModelFile ailiaTokenizerOpenModelFileA |
| #define ailiaTokenizerOpenTokenizerConfigFile ailiaTokenizerOpenTokenizerConfigFileA |
| #define ailiaTokenizerOpenVocabFile ailiaTokenizerOpenVocabFileA |
| int AILIA_API ailiaTokenizerAddSpecialTokens | ( | struct AILIATokenizer * | net, |
| const char ** | tokens, | ||
| unsigned int | count | ||
| ) |
Add SpecialToken.
| net | A tokenizer instance pointer |
| tokens | Token(UTF8) |
| count | The number of tokens |
This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_GPT2.
| int AILIA_API ailiaTokenizerCreate | ( | struct AILIATokenizer ** | net, |
| int | type, | ||
| int | flags | ||
| ) |
Creates a tokenizer instance.
| net | A pointer to the tokenizer instance pointer |
| type | AILIA_TOKENIZER_TYPE_* |
| flags | Bitwise OR of AILIA_TOKENIZER_FLAG_* |
Creates a tokenizer instance.
| int AILIA_API ailiaTokenizerDecode | ( | struct AILIATokenizer * | net, |
| const int * | tokens, | ||
| unsigned int | token_count | ||
| ) |
Perform decode.
| net | A tokenizer instance pointer |
| tokens | Tokens for decode |
| token_count | The number of tokens |
Get the decoded result with ailiaTokenizerGetText API. Similarly to skip_special_tokens=True, special tokens will not be output.
| int AILIA_API ailiaTokenizerDecodeWithSpecialTokens | ( | struct AILIATokenizer * | net, |
| const int * | tokens, | ||
| unsigned int | token_count | ||
| ) |
Perform decode with special tokens.
| net | A tokenizer instance pointer |
| tokens | Tokens for decode |
| token_count | The number of tokens |
Get the decoded result with ailiaTokenizerGetText API. Similarly to skip_special_tokens=False, special tokens will be output.
| void AILIA_API ailiaTokenizerDestroy | ( | struct AILIATokenizer * | net | ) |
It destroys the tokenizer instance.
| net | A tokenizer instance pointer |
| int AILIA_API ailiaTokenizerEncode | ( | struct AILIATokenizer * | net, |
| const char * | utf8 | ||
| ) |
Perform encode.
| net | A tokenizer instance pointer |
| utf8 | Text to encode (UTF8) |
Get the encoded result with ailiaTokenizerGetTokens API. As with split_special_tokens=True, special tokens in the input are split and treated as ordinary text.
| int AILIA_API ailiaTokenizerEncodeWithSpecialTokens | ( | struct AILIATokenizer * | net, |
| const char * | utf8 | ||
| ) |
Perform encode with special tokens.
| net | A tokenizer instance pointer |
| utf8 | Text to encode (UTF8) |
Get the encoded result with ailiaTokenizerGetTokens API. Similarly to split_special_tokens=False, special tokens will be output.
| int AILIA_API ailiaTokenizerGetCharEnds | ( | struct AILIATokenizer * | net, |
| int * | char_ends, | ||
| unsigned int | count | ||
| ) |
Gets the Char end positions.
| net | A tokenizer instance pointer |
| char_ends | Char end position |
| count | Token count |
If ailiaTokenizerEncode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE . This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT. The character end positions in UTF-32 units corresponding to each token are returned.
| int AILIA_API ailiaTokenizerGetCharStarts | ( | struct AILIATokenizer * | net, |
| int * | char_starts, | ||
| unsigned int | count | ||
| ) |
Gets the Char start positions.
| net | A tokenizer instance pointer |
| char_starts | Character start position |
| count | Token count |
If ailiaTokenizerEncode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE . This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT. The character start positions in UTF-32 units corresponding to each token are returned.
| int AILIA_API ailiaTokenizerGetText | ( | struct AILIATokenizer * | net, |
| char * | text, | ||
| unsigned int | len | ||
| ) |
Gets the decoded text.
| net | A tokenizer instance pointer |
| text | Text(UTF8) |
| len | Buffer size |
If ailiaTokenizerDecode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE .
| int AILIA_API ailiaTokenizerGetTextLength | ( | struct AILIATokenizer * | net, |
| unsigned int * | len | ||
| ) |
Gets the size of the text, including the terminating null.
| net | A tokenizer instance pointer |
| len | The length of text |
| int AILIA_API ailiaTokenizerGetTokenCount | ( | struct AILIATokenizer * | net, |
| unsigned int * | count | ||
| ) |
Gets the number of tokens.
| net | A tokenizer instance pointer |
| count | The number of tokens |
| int AILIA_API ailiaTokenizerGetTokens | ( | struct AILIATokenizer * | net, |
| int * | tokens, | ||
| unsigned int | count | ||
| ) |
Gets the tokens.
| net | A tokenizer instance pointer |
| tokens | Token |
| count | Token count |
If ailiaTokenizerEncode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE .
| int AILIA_API ailiaTokenizerGetVocab | ( | struct AILIATokenizer * | net, |
| int | token, | ||
| const char ** | vocab | ||
| ) |
Gets the vocab.
| net | A tokenizer instance pointer |
| token | Token |
| vocab | Text of vocab (UTF8) |
There is no need to release the vocab. The returned vocab pointer remains valid until the next ailiaTokenizer API call.
| int AILIA_API ailiaTokenizerGetVocabSize | ( | struct AILIATokenizer * | net, |
| unsigned int * | size | ||
| ) |
Gets the size of vocab. (Include null)
| net | A tokenizer instance pointer |
| size | The size of vocab |
| int AILIA_API ailiaTokenizerGetWordIDs | ( | struct AILIATokenizer * | net, |
| int * | word_ids, | ||
| unsigned int | count | ||
| ) |
Gets the word ID.
| net | A tokenizer instance pointer |
| word_ids | Word ID |
| count | Token count |
If ailiaTokenizerEncode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE . This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT.
| int AILIA_API ailiaTokenizerOpenAddedTokensFileA | ( | struct AILIATokenizer * | net, |
| const char * | path | ||
| ) |
Open added tokens file.
| net | A tokenizer instance pointer |
| path | Path for special token file |
Open an added tokens file (json). This API is only required for AILIA_TOKENIZER_TYPE_WHISPER.
| int AILIA_API ailiaTokenizerOpenAddedTokensFileW | ( | struct AILIATokenizer * | net, |
| const wchar_t * | path | ||
| ) |
| int AILIA_API ailiaTokenizerOpenDictionaryFileA | ( | struct AILIATokenizer * | net, |
| const char * | path | ||
| ) |
Open dictionary file.
| net | A tokenizer instance pointer |
| path | Path for dictionary of Mecab |
Open a dictionary file for MeCab. This API is only required for AILIA_TOKENIZER_TYPE_BERT_JAPANESE_XXX.
| int AILIA_API ailiaTokenizerOpenDictionaryFileW | ( | struct AILIATokenizer * | net, |
| const wchar_t * | path | ||
| ) |
| int AILIA_API ailiaTokenizerOpenMergeFileA | ( | struct AILIATokenizer * | net, |
| const char * | path | ||
| ) |
Open merges file.
| net | A tokenizer instance pointer |
| path | Path for merges file |
Open a merge file (txt). This API is only required for AILIA_TOKENIZER_TYPE_ROBERTA, AILIA_TOKENIZER_TYPE_WHISPER, or AILIA_TOKENIZER_TYPE_GPT2.
| int AILIA_API ailiaTokenizerOpenMergeFileW | ( | struct AILIATokenizer * | net, |
| const wchar_t * | path | ||
| ) |
| int AILIA_API ailiaTokenizerOpenModelFileA | ( | struct AILIATokenizer * | net, |
| const char * | path | ||
| ) |
Open model file.
| net | A tokenizer instance pointer |
| path | Path for SentencePiece |
Open a model file for SentencePiece. This API is only required for AILIA_TOKENIZER_TYPE_XLM_ROBERTA or AILIA_TOKENIZER_TYPE_MARIAN.
| int AILIA_API ailiaTokenizerOpenModelFileW | ( | struct AILIATokenizer * | net, |
| const wchar_t * | path | ||
| ) |
| int AILIA_API ailiaTokenizerOpenTokenizerConfigFileA | ( | struct AILIATokenizer * | net, |
| const char * | path | ||
| ) |
Open tokenizer config file.
| net | A tokenizer instance pointer |
| path | Path for config file |
Open a tokenizer config file (json). This API is only required for AILIA_TOKENIZER_TYPE_BERT.
| int AILIA_API ailiaTokenizerOpenTokenizerConfigFileW | ( | struct AILIATokenizer * | net, |
| const wchar_t * | path | ||
| ) |
| int AILIA_API ailiaTokenizerOpenVocabFileA | ( | struct AILIATokenizer * | net, |
| const char * | path | ||
| ) |
Open vocab file.
| net | A tokenizer instance pointer |
| path | Path for Vocab file |
Open a vocab file (json for ROBERTA or WHISPER or GPT2, txt for others).
| int AILIA_API ailiaTokenizerOpenVocabFileW | ( | struct AILIATokenizer * | net, |
| const wchar_t * | path | ||
| ) |
| int AILIA_API ailiaTokenizerUtf32ToUtf8 | ( | char * | utf8, |
| unsigned int * | processed_byte, | ||
| unsigned int | utf32 | ||
| ) |
Convert UTF32 character to UTF8 character.
| utf8 | UTF8 character (requires a buffer of at least 4 bytes) |
| processed_byte | Processed bytes on UTF8 |
| utf32 | UTF32 character |
| int AILIA_API ailiaTokenizerUtf8ToUtf32 | ( | unsigned int * | utf32, |
| unsigned int * | processed_byte, | ||
| const char * | utf8, | ||
| unsigned int | utf8_len | ||
| ) |
Convert UTF8 character to UTF32 character.
| utf32 | UTF32 character |
| processed_byte | Processed bytes on UTF8 |
| utf8 | UTF8 character |
| utf8_len | Buffer Size |