|
ailia_tokenizer
1.5.0.0
|
Public Member Functions | |
| static int | ailiaTokenizerCreate (ref IntPtr net, int type, int flags) |
| Creates a tokenizer instance. More... | |
| static int | ailiaTokenizerOpenModelFile (IntPtr net, string path) |
| Open model file. More... | |
| static int | ailiaTokenizerOpenDictionaryFile (IntPtr net, string path) |
| Open dictionary file. More... | |
| static int | ailiaTokenizerOpenVocabFile (IntPtr net, string path) |
| Open vocab file. More... | |
| static int | ailiaTokenizerOpenMergeFile (IntPtr net, string path) |
| Open merges file. More... | |
| static int | ailiaTokenizerOpenAddedTokensFile (IntPtr net, string path) |
| Open added tokens file. More... | |
| static int | ailiaTokenizerOpenTokenizerConfigFile (IntPtr net, string path) |
| Open tokenizer config file. More... | |
| static int | ailiaTokenizerEncode (IntPtr net, IntPtr utf8) |
| Perform encode. More... | |
| static int | ailiaTokenizerEncodeWithSpecialTokens (IntPtr net, IntPtr utf8) |
| Perform encode with special tokens. More... | |
| static int | ailiaTokenizerGetTokenCount (IntPtr net, ref uint count) |
| Gets the number of tokens. More... | |
| static int | ailiaTokenizerGetTokens (IntPtr net, IntPtr tokens, uint count) |
| Gets the tokens. More... | |
| static int | ailiaTokenizerGetWordIDs (IntPtr net, IntPtr tokens, uint count) |
| Gets the word ID. More... | |
| static int | ailiaTokenizerGetCharStarts (IntPtr net, IntPtr tokens, uint count) |
| Gets the Char start positions. More... | |
| static int | ailiaTokenizerGetCharEnds (IntPtr net, IntPtr tokens, uint count) |
| Gets the Char end positions. More... | |
| static int | ailiaTokenizerDecode (IntPtr net, IntPtr tokens, uint token_count) |
| Perform decode. More... | |
| static int | ailiaTokenizerDecodeWithSpecialTokens (IntPtr net, IntPtr tokens, uint token_count) |
| Perform decode with special tokens. More... | |
| static int | ailiaTokenizerGetTextLength (IntPtr net, ref uint len) |
| Gets the size of text (including the null terminator). More... | |
| static int | ailiaTokenizerGetText (IntPtr net, IntPtr text, uint len) |
| Gets the decoded text. More... | |
| static int | ailiaTokenizerGetVocabSize (IntPtr net, ref uint size) |
| Gets the size of vocab. (Include null) More... | |
| static int | ailiaTokenizerGetVocab (IntPtr net, int token, ref IntPtr vocab) |
| Gets the vocab text for a token. More... | |
| static int | ailiaTokenizerAddSpecialTokens (IntPtr net, IntPtr tokens, uint count) |
| Add SpecialToken. More... | |
| static void | ailiaTokenizerDestroy (IntPtr net) |
| It destroys the tokenizer instance. More... | |
| static int | ailiaTokenizerUtf8ToUtf32 (ref uint utf32, ref uint processed_byte, IntPtr utf8, uint utf8_len) |
| Convert UTF8 character to UTF32 character. More... | |
| static int | ailiaTokenizerUtf32ToUtf8 (IntPtr utf8, ref uint processed_byte, uint utf32) |
| Convert UTF32 character to UTF8 character. More... | |
Static Public Attributes | |
| const String | LIBRARY_NAME ="ailia_tokenizer" |
| const Int32 | AILIA_TOKENIZER_TYPE_WHISPER = (0) |
| const Int32 | AILIA_TOKENIZER_TYPE_CLIP = (1) |
| const Int32 | AILIA_TOKENIZER_TYPE_XLM_ROBERTA = (2) |
| const Int32 | AILIA_TOKENIZER_TYPE_MARIAN = (3) |
| const Int32 | AILIA_TOKENIZER_TYPE_BERT_JAPANESE_WORDPIECE = (4) |
| const Int32 | AILIA_TOKENIZER_TYPE_BERT_JAPANESE_CHARACTER = (5) |
| const Int32 | AILIA_TOKENIZER_TYPE_T5 = (6) |
| const Int32 | AILIA_TOKENIZER_TYPE_ROBERTA = (7) |
| const Int32 | AILIA_TOKENIZER_TYPE_BERT = (8) |
| const Int32 | AILIA_TOKENIZER_TYPE_GPT2 = (9) |
| const Int32 | AILIA_TOKENIZER_TYPE_LLAMA = (10) |
| const Int32 | AILIA_TOKENIZER_FLAG_NONE = (0) |
| const Int32 | AILIA_TOKENIZER_FLAG_UTF8_SAFE = (1) |
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerAddSpecialTokens | ( | IntPtr | net, |
| IntPtr | tokens, | ||
| uint | count | ||
| ) |
Add SpecialToken.
| net | A tokenizer instance pointer |
| tokens | Token(UTF8) |
| count | The number of tokens |
This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_GPT2.
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerCreate | ( | ref IntPtr | net, |
| int | type, | ||
| int | flags | ||
| ) |
Creates a tokenizer instance.
| net | A pointer to the tokenizer instance pointer |
| type | AILIA_TOKENIZER_TYPE_* |
| flags | OR of AILIA_TOKENIZER_FLAG_* |
Creates a tokenizer instance.
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerDecode | ( | IntPtr | net, |
| IntPtr | tokens, | ||
| uint | token_count | ||
| ) |
Perform decode.
| net | A tokenizer instance pointer |
| tokens | Tokens for decode |
| token_count | The number of tokens |
Get the decoded result with ailiaTokenizerGetText API.
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerDecodeWithSpecialTokens | ( | IntPtr | net, |
| IntPtr | tokens, | ||
| uint | token_count | ||
| ) |
Perform decode with special tokens.
| net | A tokenizer instance pointer |
| tokens | Tokens for decode |
| token_count | The number of tokens |
Get the decoded result with ailiaTokenizerGetText API. Similarly to skip_special_tokens=False, special tokens will be output.
| static void ailiaTokenizer.AiliaTokenizer.ailiaTokenizerDestroy | ( | IntPtr | net | ) |
It destroys the tokenizer instance.
| net | A tokenizer instance pointer |
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerEncode | ( | IntPtr | net, |
| IntPtr | utf8 | ||
| ) |
Perform encode.
| net | A tokenizer instance pointer |
| utf8 | Text for encode (UTF8) |
Get the encoded result with ailiaTokenizerGetTokens API.
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerEncodeWithSpecialTokens | ( | IntPtr | net, |
| IntPtr | utf8 | ||
| ) |
Perform encode with special tokens.
| net | A tokenizer instance pointer |
| utf8 | Text for encode (UTF8) |
Get the encoded result with ailiaTokenizerGetTokens API. Similarly to split_special_tokens=False, special tokens will be output.
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetCharEnds | ( | IntPtr | net, |
| IntPtr | tokens, | ||
| uint | count | ||
| ) |
Gets the Char end positions.
| net | A tokenizer instance pointer |
| tokens | Char end positions |
| count | Token count |
If ailiaTokenizerEncode() has not been run, the function returns AILIA_STATUS_INVALID_STATE. This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT. The character end positions in UTF-32 units corresponding to each token are returned.
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetCharStarts | ( | IntPtr | net, |
| IntPtr | tokens, | ||
| uint | count | ||
| ) |
Gets the Char start positions.
| net | A tokenizer instance pointer |
| tokens | Char start positions |
| count | Token count |
If ailiaTokenizerEncode() has not been run, the function returns AILIA_STATUS_INVALID_STATE. This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT. The character start positions in UTF-32 units corresponding to each token are returned.
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetText | ( | IntPtr | net, |
| IntPtr | text, | ||
| uint | len | ||
| ) |
Gets the decoded text.
| net | A tokenizer instance pointer |
| text | Text(UTF8) |
| len | Buffer size |
If ailiaTokenizerDecode() has not been run, the function returns AILIA_STATUS_INVALID_STATE.
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetTextLength | ( | IntPtr | net, |
| ref uint | len | ||
| ) |
Gets the size of text (including the null terminator).
| net | A tokenizer instance pointer |
| len | The length of text |
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetTokenCount | ( | IntPtr | net, |
| ref uint | count | ||
| ) |
Gets the number of tokens.
| net | A tokenizer instance pointer |
| count | The number of tokens |
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetTokens | ( | IntPtr | net, |
| IntPtr | tokens, | ||
| uint | count | ||
| ) |
Gets the tokens.
| net | A tokenizer instance pointer |
| tokens | Token |
| count | Token count |
If ailiaTokenizerEncode() has not been run, the function returns AILIA_STATUS_INVALID_STATE.
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetVocab | ( | IntPtr | net, |
| int | token, | ||
| ref IntPtr | vocab | ||
| ) |
Gets the vocab text for a token.
| net | A tokenizer instance pointer |
| token | Token |
| vocab | Text of vocab (UTF8) |
There is no need to release the vocab. The validity period of the vocab will last until the next time the ailiaTokenizer API is called.
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetVocabSize | ( | IntPtr | net, |
| ref uint | size | ||
| ) |
Gets the size of vocab. (Include null)
| net | A tokenizer instance pointer |
| size | The size of vocab |
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetWordIDs | ( | IntPtr | net, |
| IntPtr | tokens, | ||
| uint | count | ||
| ) |
Gets the word ID.
| net | A tokenizer instance pointer |
| tokens | Word IDs |
| count | Token count |
If ailiaTokenizerEncode() has not been run, the function returns AILIA_STATUS_INVALID_STATE. This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT.
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenAddedTokensFile | ( | IntPtr | net, |
| string | path | ||
| ) |
Open added tokens file.
| net | A tokenizer instance pointer |
| path | Path for special token file |
Open an added tokens file (json). This API is only required for AILIA_TOKENIZER_TYPE_WHISPER.
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenDictionaryFile | ( | IntPtr | net, |
| string | path | ||
| ) |
Open dictionary file.
| net | A tokenizer instance pointer |
| path | Path for dictionary of MeCab |
Open a dictionary file for MeCab. This API is only required for AILIA_TOKENIZER_TYPE_BERT_JAPANESE_XXX.
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenMergeFile | ( | IntPtr | net, |
| string | path | ||
| ) |
Open merges file.
| net | A tokenizer instance pointer |
| path | Path for merges file |
Open a merges file (txt). This API is only required for AILIA_TOKENIZER_TYPE_ROBERTA, AILIA_TOKENIZER_TYPE_WHISPER, or AILIA_TOKENIZER_TYPE_GPT2.
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenModelFile | ( | IntPtr | net, |
| string | path | ||
| ) |
Open model file.
| net | A tokenizer instance pointer |
| path | Path for SentencePiece |
Open a model file for SentencePiece. This API is only required for AILIA_TOKENIZER_TYPE_XLM_ROBERTA or AILIA_TOKENIZER_TYPE_MARIAN.
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenTokenizerConfigFile | ( | IntPtr | net, |
| string | path | ||
| ) |
Open tokenizer config file.
| net | A tokenizer instance pointer |
| path | Path for config file |
Open a tokenizer config file (json). This API is only required for AILIA_TOKENIZER_TYPE_BERT.
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenVocabFile | ( | IntPtr | net, |
| string | path | ||
| ) |
Open vocab file.
| net | A tokenizer instance pointer |
| path | Path for Vocab file |
Open a vocab file (json for ROBERTA or WHISPER or GPT2, txt for others).
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerUtf32ToUtf8 | ( | IntPtr | utf8, |
| ref uint | processed_byte, | ||
| uint | utf32 | ||
| ) |
Convert UTF32 character to UTF8 character.
| utf8 | UTF8 character (requires a buffer greater than 4 bytes) |
| processed_byte | Processed bytes on UTF8 |
| utf32 | UTF32 character |
| static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerUtf8ToUtf32 | ( | ref uint | utf32, |
| ref uint | processed_byte, | ||
| IntPtr | utf8, | ||
| uint | utf8_len | ||
| ) |
Convert UTF8 character to UTF32 character.
| utf32 | UTF32 character |
| processed_byte | Processed bytes on UTF8 |
| utf8 | UTF8 character |
| utf8_len | Buffer Size |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |