ailia_tokenizer  1.3.0.0
Public Member Functions | Static Public Attributes | List of all members
ailiaTokenizer.AiliaTokenizer Class Reference

Public Member Functions

static int ailiaTokenizerCreate (ref IntPtr net, int type, int flags)
 Creates a tokenizer instance. More...
 
static int ailiaTokenizerOpenModelFile (IntPtr net, string path)
 Open model file. More...
 
static int ailiaTokenizerOpenDictionaryFile (IntPtr net, string path)
 Open dictionary file. More...
 
static int ailiaTokenizerOpenVocabFile (IntPtr net, string path)
 Open vocab file. More...
 
static int ailiaTokenizerOpenMergeFile (IntPtr net, string path)
 Open merges file. More...
 
static int ailiaTokenizerOpenAddedTokensFile (IntPtr net, string path)
 Open added tokens file. More...
 
static int ailiaTokenizerOpenTokenizerConfigFile (IntPtr net, string path)
 Open tokenizer config file. More...
 
static int ailiaTokenizerEncode (IntPtr net, IntPtr utf8)
 Perform encode. More...
 
static int ailiaTokenizerEncodeWithSpecialTokens (IntPtr net, IntPtr utf8)
 Perform encode with special tokens. More...
 
static int ailiaTokenizerGetTokenCount (IntPtr net, ref uint count)
 Gets the number of tokens. More...
 
static int ailiaTokenizerGetTokens (IntPtr net, IntPtr tokens, uint count)
 Gets the tokens. More...
 
static int ailiaTokenizerGetWordIDs (IntPtr net, IntPtr tokens, uint count)
 Gets the word ID. More...
 
static int ailiaTokenizerGetCharStarts (IntPtr net, IntPtr tokens, uint count)
 Gets the Char start positions. More...
 
static int ailiaTokenizerGetCharEnds (IntPtr net, IntPtr tokens, uint count)
 Gets the Char end positions. More...
 
static int ailiaTokenizerDecode (IntPtr net, IntPtr tokens, uint token_count)
 Perform decode. More...
 
static int ailiaTokenizerDecodeWithSpecialTokens (IntPtr net, IntPtr tokens, uint token_count)
 Perform decode with special tokens. More...
 
static int ailiaTokenizerGetTextLength (IntPtr net, ref uint len)
 Gets the size of the text (including the null terminator). More...
 
static int ailiaTokenizerGetText (IntPtr net, IntPtr text, uint len)
 Gets the decoded text. More...
 
static int ailiaTokenizerGetVocabSize (IntPtr net, ref uint size)
 Gets the size of vocab. (Include null) More...
 
static int ailiaTokenizerGetVocab (IntPtr net, int token, ref IntPtr vocab)
 Gets the vocab text for a token. More...
 
static void ailiaTokenizerDestroy (IntPtr net)
 It destroys the tokenizer instance. More...
 
static int ailiaTokenizerUtf8ToUtf32 (ref uint utf32, ref uint processed_byte, IntPtr utf8, uint utf8_len)
 Convert UTF8 character to UTF32 character. More...
 
static int ailiaTokenizerUtf32ToUtf8 (IntPtr utf8, ref uint processed_byte, uint utf32)
 Convert UTF32 character to UTF8 character. More...
 

Static Public Attributes

const String LIBRARY_NAME ="ailia_tokenizer"
 
const Int32 AILIA_TOKENIZER_TYPE_WHISPER = (0)
 
const Int32 AILIA_TOKENIZER_TYPE_CLIP = (1)
 
const Int32 AILIA_TOKENIZER_TYPE_XLM_ROBERTA = (2)
 
const Int32 AILIA_TOKENIZER_TYPE_MARIAN = (3)
 
const Int32 AILIA_TOKENIZER_TYPE_BERT_JAPANESE_WORDPIECE = (4)
 
const Int32 AILIA_TOKENIZER_TYPE_BERT_JAPANESE_CHARACTER = (5)
 
const Int32 AILIA_TOKENIZER_TYPE_T5 = (6)
 
const Int32 AILIA_TOKENIZER_TYPE_ROBERTA = (7)
 
const Int32 AILIA_TOKENIZER_TYPE_BERT = (8)
 
const Int32 AILIA_TOKENIZER_TYPE_GPT2 = (9)
 
const Int32 AILIA_TOKENIZER_TYPE_LLAMA = (10)
 
const Int32 AILIA_TOKENIZER_FLAG_NONE = (0)
 
const Int32 AILIA_TOKENIZER_FLAG_UTF8_SAFE = (1)
 

Member Function Documentation

◆ ailiaTokenizerCreate()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerCreate ( ref IntPtr  net,
int  type,
int  flags 
)

Creates a tokenizer instance.

Parameters
netA pointer to the tokenizer instance pointer
typeAILIA_TOKENIZER_TYPE_*
flagsBitwise OR of AILIA_TOKENIZER_FLAG_*
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Creates a tokenizer instance.

◆ ailiaTokenizerDecode()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerDecode ( IntPtr  net,
IntPtr  tokens,
uint  token_count 
)

Perform decode.

Parameters
netA tokenizer instance pointer
tokensTokens for decode
token_countThe number of tokens
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Get the decoded result with ailiaTokenizerGetText API.

◆ ailiaTokenizerDecodeWithSpecialTokens()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerDecodeWithSpecialTokens ( IntPtr  net,
IntPtr  tokens,
uint  token_count 
)

Perform decode with special tokens.

Parameters
netA tokenizer instance pointer
tokensTokens for decode
token_countThe number of tokens
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Get the decoded result with ailiaTokenizerGetText API. Similarly to skip_special_tokens=False, special tokens will be output.

◆ ailiaTokenizerDestroy()

static void ailiaTokenizer.AiliaTokenizer.ailiaTokenizerDestroy ( IntPtr  net)

It destroys the tokenizer instance.

Parameters
netA tokenizer instance pointer

◆ ailiaTokenizerEncode()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerEncode ( IntPtr  net,
IntPtr  utf8 
)

Perform encode.

Parameters
netA tokenizer instance pointer
utf8Text for encode (UTF8)
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Get the encoded result with ailiaTokenizerGetTokens API.

◆ ailiaTokenizerEncodeWithSpecialTokens()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerEncodeWithSpecialTokens ( IntPtr  net,
IntPtr  utf8 
)

Perform encode with special tokens.

Parameters
netA tokenizer instance pointer
utf8Text for encode (UTF8)
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Get the encoded result with ailiaTokenizerGetTokens API. Similarly to split_special_tokens=False, special tokens will be output.

◆ ailiaTokenizerGetCharEnds()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetCharEnds ( IntPtr  net,
IntPtr  tokens,
uint  count 
)

Gets the Char end positions.

Parameters
netA tokenizer instance pointer
tokensChar end positions (output buffer)
countToken count
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

If ailiaTokenizerEncode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE . This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT. The character end positions in UTF-32 units corresponding to each token are returned.

◆ ailiaTokenizerGetCharStarts()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetCharStarts ( IntPtr  net,
IntPtr  tokens,
uint  count 
)

Gets the Char start positions.

Parameters
netA tokenizer instance pointer
tokensChar start positions (output buffer)
countToken count
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

If ailiaTokenizerEncode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE . This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT. The character start positions in UTF-32 units corresponding to each token are returned.

◆ ailiaTokenizerGetText()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetText ( IntPtr  net,
IntPtr  text,
uint  len 
)

Gets the decoded text.

Parameters
netA tokenizer instance pointer
textText(UTF8)
lenBuffer size
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

If ailiaTokenizerDecode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE .

◆ ailiaTokenizerGetTextLength()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetTextLength ( IntPtr  net,
ref uint  len 
)

Gets the size of the text (including the null terminator).

Parameters
netA tokenizer instance pointer
lenThe length of text
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

◆ ailiaTokenizerGetTokenCount()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetTokenCount ( IntPtr  net,
ref uint  count 
)

Gets the number of tokens.

Parameters
netA tokenizer instance pointer
countThe number of tokens
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

◆ ailiaTokenizerGetTokens()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetTokens ( IntPtr  net,
IntPtr  tokens,
uint  count 
)

Gets the tokens.

Parameters
netA tokenizer instance pointer
tokensToken
countToken count
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

If ailiaTokenizerEncode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE .

◆ ailiaTokenizerGetVocab()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetVocab ( IntPtr  net,
int  token,
ref IntPtr  vocab 
)

Gets the vocab text for a token.

Parameters
netA tokenizer instance pointer
tokenToken
vocabText of vocab (UTF8)
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

There is no need to release the vocab. The validity period of the vocab will last until the next time the ailiaTokenizer API is called.

◆ ailiaTokenizerGetVocabSize()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetVocabSize ( IntPtr  net,
ref uint  size 
)

Gets the size of vocab. (Include null)

Parameters
netA tokenizer instance pointer
sizeThe size of vocab
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

◆ ailiaTokenizerGetWordIDs()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerGetWordIDs ( IntPtr  net,
IntPtr  tokens,
uint  count 
)

Gets the word ID.

Parameters
netA tokenizer instance pointer
tokensWord IDs (output buffer)
countToken count
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

If ailiaTokenizerEncode() is not run at all, the function returns AILIA_STATUS_INVALID_STATE . This is valid only for AILIA_TOKENIZER_TYPE_ROBERTA and AILIA_TOKENIZER_TYPE_BERT.

◆ ailiaTokenizerOpenAddedTokensFile()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenAddedTokensFile ( IntPtr  net,
string  path 
)

Open added tokens file.

Parameters
netA tokenizer instance pointer
pathPath for added tokens file
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Open an added tokens file (json). This API is only required for AILIA_TOKENIZER_TYPE_WHISPER.

◆ ailiaTokenizerOpenDictionaryFile()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenDictionaryFile ( IntPtr  net,
string  path 
)

Open dictionary file.

Parameters
netA tokenizer instance pointer
pathPath for dictionary of Mecab
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Open a dictionary file for MeCab. This API is only required for AILIA_TOKENIZER_TYPE_BERT_JAPANESE_XXX.

◆ ailiaTokenizerOpenMergeFile()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenMergeFile ( IntPtr  net,
string  path 
)

Open merges file.

Parameters
netA tokenizer instance pointer
pathPath for merges file
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Open a merges file (txt). This API is only required for AILIA_TOKENIZER_TYPE_ROBERTA, AILIA_TOKENIZER_TYPE_WHISPER, or AILIA_TOKENIZER_TYPE_GPT2.

◆ ailiaTokenizerOpenModelFile()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenModelFile ( IntPtr  net,
string  path 
)

Open model file.

Parameters
netA tokenizer instance pointer
pathPath for SentencePiece
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Open a model file for SentencePiece. This API is only required for AILIA_TOKENIZER_TYPE_XLM_ROBERTA or AILIA_TOKENIZER_TYPE_MARIAN.

◆ ailiaTokenizerOpenTokenizerConfigFile()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenTokenizerConfigFile ( IntPtr  net,
string  path 
)

Open tokenizer config file.

Parameters
netA tokenizer instance pointer
pathPath for config file
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Open a tokenizer config file (json). This API is only required for AILIA_TOKENIZER_TYPE_BERT.

◆ ailiaTokenizerOpenVocabFile()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerOpenVocabFile ( IntPtr  net,
string  path 
)

Open vocab file.

Parameters
netA tokenizer instance pointer
pathPath for Vocab file
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Open a vocab file (json for ROBERTA or WHISPER or GPT2, txt for others).

◆ ailiaTokenizerUtf32ToUtf8()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerUtf32ToUtf8 ( IntPtr  utf8,
ref uint  processed_byte,
uint  utf32 
)

Convert UTF32 character to UTF8 character.

Parameters
utf8UTF8 character (requires a buffer of 4 bytes or more)
processed_byteProcessed bytes on UTF8
utf32UTF32 character
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

◆ ailiaTokenizerUtf8ToUtf32()

static int ailiaTokenizer.AiliaTokenizer.ailiaTokenizerUtf8ToUtf32 ( ref uint  utf32,
ref uint  processed_byte,
IntPtr  utf8,
uint  utf8_len 
)

Convert UTF8 character to UTF32 character.

Parameters
utf32UTF32 character
processed_byteProcessed bytes on UTF8
utf8UTF8 character
utf8_lenBuffer Size
Returns
If this function is successful, it returns AILIA_STATUS_SUCCESS , or an error code otherwise.

Member Data Documentation

◆ AILIA_TOKENIZER_FLAG_NONE

const Int32 ailiaTokenizer.AiliaTokenizer.AILIA_TOKENIZER_FLAG_NONE = (0)
static

◆ AILIA_TOKENIZER_FLAG_UTF8_SAFE

const Int32 ailiaTokenizer.AiliaTokenizer.AILIA_TOKENIZER_FLAG_UTF8_SAFE = (1)
static

◆ AILIA_TOKENIZER_TYPE_BERT

const Int32 ailiaTokenizer.AiliaTokenizer.AILIA_TOKENIZER_TYPE_BERT = (8)
static

◆ AILIA_TOKENIZER_TYPE_BERT_JAPANESE_CHARACTER

const Int32 ailiaTokenizer.AiliaTokenizer.AILIA_TOKENIZER_TYPE_BERT_JAPANESE_CHARACTER = (5)
static

◆ AILIA_TOKENIZER_TYPE_BERT_JAPANESE_WORDPIECE

const Int32 ailiaTokenizer.AiliaTokenizer.AILIA_TOKENIZER_TYPE_BERT_JAPANESE_WORDPIECE = (4)
static

◆ AILIA_TOKENIZER_TYPE_CLIP

const Int32 ailiaTokenizer.AiliaTokenizer.AILIA_TOKENIZER_TYPE_CLIP = (1)
static

◆ AILIA_TOKENIZER_TYPE_GPT2

const Int32 ailiaTokenizer.AiliaTokenizer.AILIA_TOKENIZER_TYPE_GPT2 = (9)
static

◆ AILIA_TOKENIZER_TYPE_LLAMA

const Int32 ailiaTokenizer.AiliaTokenizer.AILIA_TOKENIZER_TYPE_LLAMA = (10)
static

◆ AILIA_TOKENIZER_TYPE_MARIAN

const Int32 ailiaTokenizer.AiliaTokenizer.AILIA_TOKENIZER_TYPE_MARIAN = (3)
static

◆ AILIA_TOKENIZER_TYPE_ROBERTA

const Int32 ailiaTokenizer.AiliaTokenizer.AILIA_TOKENIZER_TYPE_ROBERTA = (7)
static

◆ AILIA_TOKENIZER_TYPE_T5

const Int32 ailiaTokenizer.AiliaTokenizer.AILIA_TOKENIZER_TYPE_T5 = (6)
static

◆ AILIA_TOKENIZER_TYPE_WHISPER

const Int32 ailiaTokenizer.AiliaTokenizer.AILIA_TOKENIZER_TYPE_WHISPER = (0)
static

◆ AILIA_TOKENIZER_TYPE_XLM_ROBERTA

const Int32 ailiaTokenizer.AiliaTokenizer.AILIA_TOKENIZER_TYPE_XLM_ROBERTA = (2)
static

◆ LIBRARY_NAME

const String ailiaTokenizer.AiliaTokenizer.LIBRARY_NAME ="ailia_tokenizer"
static

The documentation for this class was generated from the following file: