ailia_tokenizer  1.3.0.0
ailia_tokenizer.h
Go to the documentation of this file.
1 
9 #ifndef INCLUDED_AILIA_TOKENIZER
10 #define INCLUDED_AILIA_TOKENIZER
11 
12 /* エラーコードと構造体 */
13 
14 #include "ailia.h"
15 
16 /* Calling convention: AILIA_API is empty on 64-bit Windows, x86-64, Apple, Android, Linux and Nintendo SDK builds, and __stdcall on every other target */
17 
18 #if defined(_WIN64) || defined(_M_X64) || defined(__amd64__) || defined(__x86_64__) || defined(__APPLE__) || \
19  defined(__ANDROID__) || defined(ANDROID) || defined(__linux__) || defined(NN_NINTENDO_SDK)
20 #define AILIA_API
21 #else
22 #define AILIA_API __stdcall
23 #endif
24 
25 #ifdef __cplusplus
26 extern "C" {
27 #endif
28 
29 /****************************************************************
30  * Algorithm definitions: tokenizer type identifiers (AILIA_TOKENIZER_TYPE_*), numbered 0 through 10
31  **/
32 
42 #define AILIA_TOKENIZER_TYPE_WHISPER (0)
43 
53 #define AILIA_TOKENIZER_TYPE_CLIP (1)
54 
64 #define AILIA_TOKENIZER_TYPE_XLM_ROBERTA (2)
65 
75 #define AILIA_TOKENIZER_TYPE_MARIAN (3)
76 
90 #define AILIA_TOKENIZER_TYPE_BERT_JAPANESE_WORDPIECE (4)
91 
105 #define AILIA_TOKENIZER_TYPE_BERT_JAPANESE_CHARACTER (5)
106 
122 #define AILIA_TOKENIZER_TYPE_T5 (6)
123 
133 #define AILIA_TOKENIZER_TYPE_ROBERTA (7)
134 
144 #define AILIA_TOKENIZER_TYPE_BERT (8)
145 
155 #define AILIA_TOKENIZER_TYPE_GPT2 (9)
156 
166 #define AILIA_TOKENIZER_TYPE_LLAMA (10)
167 
168 /****************************************************************
169  * Flag definitions (AILIA_TOKENIZER_FLAG_*)
170  **/
171 
181 #define AILIA_TOKENIZER_FLAG_NONE (0)
182 
192 #define AILIA_TOKENIZER_FLAG_UTF8_SAFE (1)
193 
194 /****************************************************************
195  * Tokenizer object instance (only forward-declared in this header; used through pointers)
196  **/
197 
198 struct AILIATokenizer;
199 
200 /****************************************************************
201  * Tokenizer API
202  **/
203 
225 int AILIA_API ailiaTokenizerCreate(struct AILIATokenizer** net, int type, int flags);
226 
246 int AILIA_API ailiaTokenizerOpenModelFileA(struct AILIATokenizer* net, const char *path);
247 int AILIA_API ailiaTokenizerOpenModelFileW(struct AILIATokenizer* net, const wchar_t *path);
248 
268 int AILIA_API ailiaTokenizerOpenDictionaryFileA(struct AILIATokenizer* net, const char *path);
269 int AILIA_API ailiaTokenizerOpenDictionaryFileW(struct AILIATokenizer* net, const wchar_t *path);
270 
290 int AILIA_API ailiaTokenizerOpenVocabFileA(struct AILIATokenizer* net, const char *path);
291 int AILIA_API ailiaTokenizerOpenVocabFileW(struct AILIATokenizer* net, const wchar_t *path);
292 
312 int AILIA_API ailiaTokenizerOpenMergeFileA(struct AILIATokenizer* net, const char *path);
313 int AILIA_API ailiaTokenizerOpenMergeFileW(struct AILIATokenizer* net, const wchar_t *path);
314 
334 int AILIA_API ailiaTokenizerOpenAddedTokensFileA(struct AILIATokenizer* net, const char *path);
335 int AILIA_API ailiaTokenizerOpenAddedTokensFileW(struct AILIATokenizer* net, const wchar_t *path);
336 
356 int AILIA_API ailiaTokenizerOpenTokenizerConfigFileA(struct AILIATokenizer* net, const char *path);
357 int AILIA_API ailiaTokenizerOpenTokenizerConfigFileW(struct AILIATokenizer* net, const wchar_t *path);
358 
380 int AILIA_API
381 ailiaTokenizerEncode(struct AILIATokenizer* net, const char *utf8);
382 
404 int AILIA_API
405 ailiaTokenizerEncodeWithSpecialTokens(struct AILIATokenizer* net, const char *utf8);
406 
422 int AILIA_API ailiaTokenizerGetTokenCount(struct AILIATokenizer* net, unsigned int* count);
423 
446 int AILIA_API ailiaTokenizerGetTokens(struct AILIATokenizer* net, int* tokens, unsigned int count);
447 
472 int AILIA_API ailiaTokenizerGetWordIDs(struct AILIATokenizer* net, int* word_ids, unsigned int count);
473 
500 int AILIA_API ailiaTokenizerGetCharStarts(struct AILIATokenizer* net, int* char_starts, unsigned int count);
501 
528 int AILIA_API ailiaTokenizerGetCharEnds(struct AILIATokenizer* net, int* char_ends, unsigned int count);
529 
553 int AILIA_API
554 ailiaTokenizerDecode(struct AILIATokenizer* net, const int *tokens, unsigned int token_count);
555 
579 int AILIA_API
580 ailiaTokenizerDecodeWithSpecialTokens(struct AILIATokenizer* net, const int *tokens, unsigned int token_count);
581 
597 int AILIA_API ailiaTokenizerGetTextLength(struct AILIATokenizer* net, unsigned int* len);
598 
620 int AILIA_API ailiaTokenizerGetText(struct AILIATokenizer* net, char* text, unsigned int len);
621 
637 int AILIA_API ailiaTokenizerGetVocabSize(struct AILIATokenizer* net, unsigned int* size);
638 
662 int AILIA_API
663 ailiaTokenizerGetVocab(struct AILIATokenizer* net, int token, const char **vocab);
664 
674 void AILIA_API ailiaTokenizerDestroy(struct AILIATokenizer* net);
675 
676 /****************************************************************
677  * Utility API
678  **/
679 
699 int AILIA_API ailiaTokenizerUtf8ToUtf32(unsigned int* utf32, unsigned int* processed_byte, const char* utf8, unsigned int utf8_len);
700 
718 int AILIA_API ailiaTokenizerUtf32ToUtf8(char* utf8, unsigned int* processed_byte, unsigned int utf32);
719 
720 #ifdef UNICODE /* UNICODE builds: each generic Open*File name maps to its wchar_t (W) variant */
721 #define ailiaTokenizerOpenModelFile ailiaTokenizerOpenModelFileW
722 #define ailiaTokenizerOpenDictionaryFile ailiaTokenizerOpenDictionaryFileW
723 #define ailiaTokenizerOpenVocabFile ailiaTokenizerOpenVocabFileW
724 #define ailiaTokenizerOpenMergeFile ailiaTokenizerOpenMergeFileW
725 #define ailiaTokenizerOpenAddedTokensFile ailiaTokenizerOpenAddedTokensFileW
726 #define ailiaTokenizerOpenTokenizerConfigFile ailiaTokenizerOpenTokenizerConfigFileW
727 #else /* non-UNICODE builds: each generic name maps to its char (A) variant */
728 #define ailiaTokenizerOpenModelFile ailiaTokenizerOpenModelFileA
729 #define ailiaTokenizerOpenDictionaryFile ailiaTokenizerOpenDictionaryFileA
730 #define ailiaTokenizerOpenVocabFile ailiaTokenizerOpenVocabFileA
731 #define ailiaTokenizerOpenMergeFile ailiaTokenizerOpenMergeFileA
732 #define ailiaTokenizerOpenAddedTokensFile ailiaTokenizerOpenAddedTokensFileA
733 #define ailiaTokenizerOpenTokenizerConfigFile ailiaTokenizerOpenTokenizerConfigFileA
734 #endif /* UNICODE */
735 
736 #ifdef __cplusplus
737 }
738 #endif
739 #endif /* !defined(INCLUDED_AILIA_TOKENIZER) */
ailiaTokenizerGetTextLength
int AILIA_API ailiaTokenizerGetTextLength(struct AILIATokenizer *net, unsigned int *len)
Gets the size of the text (including the terminating null).
ailiaTokenizerOpenAddedTokensFileA
int AILIA_API ailiaTokenizerOpenAddedTokensFileA(struct AILIATokenizer *net, const char *path)
Open added tokens file.
ailiaTokenizerOpenVocabFileA
int AILIA_API ailiaTokenizerOpenVocabFileA(struct AILIATokenizer *net, const char *path)
Open vocab file.
ailiaTokenizerUtf8ToUtf32
int AILIA_API ailiaTokenizerUtf8ToUtf32(unsigned int *utf32, unsigned int *processed_byte, const char *utf8, unsigned int utf8_len)
Convert UTF8 character to UTF32 character.
ailiaTokenizerCreate
int AILIA_API ailiaTokenizerCreate(struct AILIATokenizer **net, int type, int flags)
Creates a tokenizer instance.
ailiaTokenizerGetText
int AILIA_API ailiaTokenizerGetText(struct AILIATokenizer *net, char *text, unsigned int len)
Gets the decoded text.
ailiaTokenizerDestroy
void AILIA_API ailiaTokenizerDestroy(struct AILIATokenizer *net)
Destroys the tokenizer instance.
ailiaTokenizerDecode
int AILIA_API ailiaTokenizerDecode(struct AILIATokenizer *net, const int *tokens, unsigned int token_count)
Perform decode.
ailiaTokenizerEncode
int AILIA_API ailiaTokenizerEncode(struct AILIATokenizer *net, const char *utf8)
Perform encode.
ailiaTokenizerUtf32ToUtf8
int AILIA_API ailiaTokenizerUtf32ToUtf8(char *utf8, unsigned int *processed_byte, unsigned int utf32)
Convert UTF32 character to UTF8 character.
ailiaTokenizerOpenModelFileW
int AILIA_API ailiaTokenizerOpenModelFileW(struct AILIATokenizer *net, const wchar_t *path)
ailiaTokenizerGetVocabSize
int AILIA_API ailiaTokenizerGetVocabSize(struct AILIATokenizer *net, unsigned int *size)
Gets the size of vocab. (Include null)
ailiaTokenizerOpenVocabFileW
int AILIA_API ailiaTokenizerOpenVocabFileW(struct AILIATokenizer *net, const wchar_t *path)
ailiaTokenizerGetVocab
int AILIA_API ailiaTokenizerGetVocab(struct AILIATokenizer *net, int token, const char **vocab)
Gets the vocab string for a token.
ailiaTokenizerOpenTokenizerConfigFileW
int AILIA_API ailiaTokenizerOpenTokenizerConfigFileW(struct AILIATokenizer *net, const wchar_t *path)
ailiaTokenizerGetTokens
int AILIA_API ailiaTokenizerGetTokens(struct AILIATokenizer *net, int *tokens, unsigned int count)
Gets the tokens.
ailiaTokenizerGetCharStarts
int AILIA_API ailiaTokenizerGetCharStarts(struct AILIATokenizer *net, int *char_starts, unsigned int count)
Gets the Char start positions.
ailiaTokenizerOpenTokenizerConfigFileA
int AILIA_API ailiaTokenizerOpenTokenizerConfigFileA(struct AILIATokenizer *net, const char *path)
Open tokenizer config file.
ailiaTokenizerOpenDictionaryFileW
int AILIA_API ailiaTokenizerOpenDictionaryFileW(struct AILIATokenizer *net, const wchar_t *path)
AILIA_API
#define AILIA_API
Definition: ailia_tokenizer.h:22
ailiaTokenizerGetWordIDs
int AILIA_API ailiaTokenizerGetWordIDs(struct AILIATokenizer *net, int *word_ids, unsigned int count)
Gets the word ID.
ailiaTokenizerOpenMergeFileA
int AILIA_API ailiaTokenizerOpenMergeFileA(struct AILIATokenizer *net, const char *path)
Open merges file.
ailiaTokenizerOpenModelFileA
int AILIA_API ailiaTokenizerOpenModelFileA(struct AILIATokenizer *net, const char *path)
Open model file.
ailiaTokenizerOpenAddedTokensFileW
int AILIA_API ailiaTokenizerOpenAddedTokensFileW(struct AILIATokenizer *net, const wchar_t *path)
ailiaTokenizerGetTokenCount
int AILIA_API ailiaTokenizerGetTokenCount(struct AILIATokenizer *net, unsigned int *count)
Gets the number of tokens.
ailiaTokenizerOpenDictionaryFileA
int AILIA_API ailiaTokenizerOpenDictionaryFileA(struct AILIATokenizer *net, const char *path)
Open dictionary file.
ailiaTokenizerEncodeWithSpecialTokens
int AILIA_API ailiaTokenizerEncodeWithSpecialTokens(struct AILIATokenizer *net, const char *utf8)
Perform encode with special tokens.
ailiaTokenizerOpenMergeFileW
int AILIA_API ailiaTokenizerOpenMergeFileW(struct AILIATokenizer *net, const wchar_t *path)
ailiaTokenizerGetCharEnds
int AILIA_API ailiaTokenizerGetCharEnds(struct AILIATokenizer *net, int *char_ends, unsigned int count)
Gets the Char end positions.
ailiaTokenizerDecodeWithSpecialTokens
int AILIA_API ailiaTokenizerDecodeWithSpecialTokens(struct AILIATokenizer *net, const int *tokens, unsigned int token_count)
Perform decode with special tokens.