ailia_tokenizer
1.4.0.0
|
Public Member Functions | |
static int | ailiaTokenizerCreate (ref IntPtr net, int type, int flags) |
Creates a tokenizer instance. More... | |
static int | ailiaTokenizerOpenModelFile (IntPtr net, string path) |
Open model file. More... | |
static int | ailiaTokenizerOpenDictionaryFile (IntPtr net, string path) |
Open dictionary file. More... | |
static int | ailiaTokenizerOpenVocabFile (IntPtr net, string path) |
Open vocab file. More... | |
static int | ailiaTokenizerOpenMergeFile (IntPtr net, string path) |
Open merges file. More... | |
static int | ailiaTokenizerOpenAddedTokensFile (IntPtr net, string path) |
Open added tokens file. More... | |
static int | ailiaTokenizerOpenTokenizerConfigFile (IntPtr net, string path) |
Open tokenizer config file. More... | |
static int | ailiaTokenizerEncode (IntPtr net, IntPtr utf8) |
Perform encode. More... | |
static int | ailiaTokenizerEncodeWithSpecialTokens (IntPtr net, IntPtr utf8) |
Perform encode with special tokens. More... | |
static int | ailiaTokenizerGetTokenCount (IntPtr net, ref uint count) |
Gets the number of tokens. More... | |
static int | ailiaTokenizerGetTokens (IntPtr net, IntPtr tokens, uint count) |
Gets the tokens. More... | |
static int | ailiaTokenizerGetWordIDs (IntPtr net, IntPtr tokens, uint count) |
Gets the word ID. More... | |
static int | ailiaTokenizerGetCharStarts (IntPtr net, IntPtr tokens, uint count) |
Gets the Char start positions. More... | |
static int | ailiaTokenizerGetCharEnds (IntPtr net, IntPtr tokens, uint count) |
Gets the Char end positions. More... | |
static int | ailiaTokenizerDecode (IntPtr net, IntPtr tokens, uint token_count) |
Perform decode. More... | |
static int | ailiaTokenizerDecodeWithSpecialTokens (IntPtr net, IntPtr tokens, uint token_count) |
Perform decode with special tokens. More... | |
static int | ailiaTokenizerGetTextLength (IntPtr net, ref uint len) |
Gets the size of text. (Include null) More... | |
static int | ailiaTokenizerGetText (IntPtr net, IntPtr text, uint len) |
Gets the decoded text. More... | |
static int | ailiaTokenizerGetVocabSize (IntPtr net, ref uint size) |
Gets the size of vocab. (Include null) More... | |
static int | ailiaTokenizerGetVocab (IntPtr net, int token, ref IntPtr vocab) |
Gets the vocab. More... | |
static int | ailiaTokenizerAddSpecialTokens (IntPtr net, IntPtr tokens, uint count) |
Add SpecialToken. More... | |
static void | ailiaTokenizerDestroy (IntPtr net) |
It destroys the tokenizer instance. More... | |
static int | ailiaTokenizerUtf8ToUtf32 (ref uint utf32, ref uint processed_byte, IntPtr utf8, uint utf8_len) |
Convert UTF8 character to UTF32 character. More... | |
static int | ailiaTokenizerUtf32ToUtf8 (IntPtr utf8, ref uint processed_byte, uint utf32) |
Convert UTF32 character to UTF8 character. More... | |
Static Public Attributes | |
const String | LIBRARY_NAME ="ailia_tokenizer" |
const Int32 | AILIA_TOKENIZER_TYPE_WHISPER = (0) |
const Int32 | AILIA_TOKENIZER_TYPE_CLIP = (1) |
const Int32 | AILIA_TOKENIZER_TYPE_XLM_ROBERTA = (2) |
const Int32 | AILIA_TOKENIZER_TYPE_MARIAN = (3) |
const Int32 | AILIA_TOKENIZER_TYPE_BERT_JAPANESE_WORDPIECE = (4) |
const Int32 | AILIA_TOKENIZER_TYPE_BERT_JAPANESE_CHARACTER = (5) |
const Int32 | AILIA_TOKENIZER_TYPE_T5 = (6) |
const Int32 | AILIA_TOKENIZER_TYPE_ROBERTA = (7) |
const Int32 | AILIA_TOKENIZER_TYPE_BERT = (8) |
const Int32 | AILIA_TOKENIZER_TYPE_GPT2 = (9) |
const Int32 | AILIA_TOKENIZER_TYPE_LLAMA = (10) |
const Int32 | AILIA_TOKENIZER_FLAG_NONE = (0) |
const Int32 | AILIA_TOKENIZER_FLAG_UTF8_SAFE = (1) |
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerAddSpecialTokens | ( | IntPtr | net, |
IntPtr | tokens, | ||
uint | count | ||
) |
Add SpecialToken.
net | A tokenizer instance pointer |
tokens | Token(UTF8) |
count | The number of tokens |
This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA.
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerCreate | ( | ref IntPtr | net, |
int | type, | ||
int | flags | ||
) |
Creates a tokenizer instance.
net | A pointer to the tokenizer instance pointer |
type | AILIA_TOKENIZER_TYPE_* |
flags | OR of AILIA_TOKENIZER_FLAG_* |
Creates a tokenizer instance.
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerDecode | ( | IntPtr | net, |
IntPtr | tokens, | ||
uint | token_count | ||
) |
Perform decode.
net | A tokenizer instance pointer |
tokens | Tokens for decode |
token_count | The number of tokens |
Get the decoded result with ailiaTokenizerGetText API.
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerDecodeWithSpecialTokens | ( | IntPtr | net, |
IntPtr | tokens, | ||
uint | token_count | ||
) |
Perform decode with special tokens.
net | A tokenizer instance pointer |
tokens | Tokens for decode |
token_count | The number of tokens |
Get the decoded result with ailiaTokenizerGetText API. Similarly to skip_special_tokens=False, special tokens will be output.
static void ailiaTokenizer.AiliaTokenizer.ailiaTokenizerDestroy | ( | IntPtr | net | ) |
It destroys the tokenizer instance.
net | A tokenizer instance pointer |
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerEncode | ( | IntPtr | net, |
IntPtr | utf8 | ||
) |
Perform encode.
net | A tokenizer instance pointer |
utf8 | Text for encode (UTF8) |
Get the encoded result with ailiaTokenizerGetTokens API.
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerEncodeWithSpecialTokens | ( | IntPtr | net, |
IntPtr | utf8 | ||
) |
Perform encode with special tokens.
net | A tokenizer instance pointer |
utf8 | Text for encode (UTF8) |
Get the encoded result with ailiaTokenizerGetTokens API. Similarly to split_special_tokens=False, special tokens will be output.
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetCharEnds | ( | IntPtr | net, |
IntPtr | tokens, | ||
uint | count | ||
) |
Gets the Char end positions.
net | A tokenizer instance pointer |
tokens | Char end positions |
count | Token count |
If ailiaTokenizerEncode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE. This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT. The character end positions in UTF-32 units corresponding to each token are returned.
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetCharStarts | ( | IntPtr | net, |
IntPtr | tokens, | ||
uint | count | ||
) |
Gets the Char start positions.
net | A tokenizer instance pointer |
tokens | Char start positions |
count | Token count |
If ailiaTokenizerEncode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE. This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT. The character start positions in UTF-32 units corresponding to each token are returned.
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetText | ( | IntPtr | net, |
IntPtr | text, | ||
uint | len | ||
) |
Gets the decoded text.
net | A tokenizer instance pointer |
text | Text(UTF8) |
len | Buffer size |
If ailiaTokenizerDecode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE.
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetTextLength | ( | IntPtr | net, |
ref uint | len | ||
) |
Gets the size of text. (Include null)
net | A tokenizer instance pointer |
len | The length of text |
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetTokenCount | ( | IntPtr | net, |
ref uint | count | ||
) |
Gets the number of tokens.
net | A tokenizer instance pointer |
count | The number of tokens |
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetTokens | ( | IntPtr | net, |
IntPtr | tokens, | ||
uint | count | ||
) |
Gets the tokens.
net | A tokenizer instance pointer |
tokens | Token |
count | Token count |
If ailiaTokenizerEncode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE.
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetVocab | ( | IntPtr | net, |
int | token, | ||
ref IntPtr | vocab | ||
) |
Gets the vocab.
net | A tokenizer instance pointer |
token | Token |
vocab | Text of vocab (UTF8) |
There is no need to release the vocab. The validity period of the vocab will last until the next time the ailiaTokenizer API is called.
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetVocabSize | ( | IntPtr | net, |
ref uint | size | ||
) |
Gets the size of vocab. (Include null)
net | A tokenizer instance pointer |
size | The size of vocab |
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetWordIDs | ( | IntPtr | net, |
IntPtr | tokens, | ||
uint | count | ||
) |
Gets the word ID.
net | A tokenizer instance pointer |
tokens | Word IDs |
count | Token count |
If ailiaTokenizerEncode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE. This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT.
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenAddedTokensFile | ( | IntPtr | net, |
string | path | ||
) |
Open added tokens file.
net | A tokenizer instance pointer |
path | Path for added tokens file |
Open an added tokens file (json). This API is only required for AILIA_TOKENIZER_TYPE_WHISPER.
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenDictionaryFile | ( | IntPtr | net, |
string | path | ||
) |
Open dictionary file.
net | A tokenizer instance pointer |
path | Path for dictionary of MeCab |
Open a dictionary file for MeCab. This API is only required for AILIA_TOKENIZER_TYPE_BERT_JAPANESE_XXX.
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenMergeFile | ( | IntPtr | net, |
string | path | ||
) |
Open merges file.
net | A tokenizer instance pointer |
path | Path for merges file |
Open a merges file (txt). This API is only required for AILIA_TOKENIZER_TYPE_ROBERTA, AILIA_TOKENIZER_TYPE_WHISPER, or AILIA_TOKENIZER_TYPE_GPT2.
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenModelFile | ( | IntPtr | net, |
string | path | ||
) |
Open model file.
net | A tokenizer instance pointer |
path | Path for SentencePiece model file |
Open a model file for SentencePiece. This API is only required for AILIA_TOKENIZER_TYPE_XLM_ROBERTA or AILIA_TOKENIZER_TYPE_MARIAN.
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenTokenizerConfigFile | ( | IntPtr | net, |
string | path | ||
) |
Open tokenizer config file.
net | A tokenizer instance pointer |
path | Path for config file |
Open a tokenizer config file (json). This API is only required for AILIA_TOKENIZER_TYPE_BERT.
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenVocabFile | ( | IntPtr | net, |
string | path | ||
) |
Open vocab file.
net | A tokenizer instance pointer |
path | Path for Vocab file |
Open a vocab file (json for ROBERTA or WHISPER or GPT2, txt for others).
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerUtf32ToUtf8 | ( | IntPtr | utf8, |
ref uint | processed_byte, | ||
uint | utf32 | ||
) |
Convert UTF32 character to UTF8 character.
utf8 | UTF8 character (requires a buffer of 4 bytes or more) |
processed_byte | Processed bytes on UTF8 |
utf32 | UTF32 character |
static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerUtf8ToUtf32 | ( | ref uint | utf32, |
ref uint | processed_byte, | ||
IntPtr | utf8, | ||
uint | utf8_len | ||
) |
Convert UTF8 character to UTF32 character.
utf32 | UTF32 character |
processed_byte | Processed bytes on UTF8 |
utf8 | UTF8 character |
utf8_len | Buffer Size |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |