Tokenizers are custom C functions that turn a stream of characters into "tokens". Tokens can be anything you want: you can break on whitespace, special characters, etc. They are meant to help you split text so that full-text search queries are more accurate.
op-sqlite has a novel way for you to create your tokenizers.
Install the beta branch: `npm i -s @op-engineering/[email protected]`
Declare which tokenizers you want in your `package.json`:
"op-sqlite": {
// Leave whatever configuration you already have
"fts5": true, // fts needs to be enabled
"tokenizers": ["word_tokenizer"] // declare which tokenizers you will create
}
Run `pod install`. The podspec now contains a code generation step: it will create a `c_sources` folder at the root of your project, containing a `tokenizers.h` file. DON'T TOUCH THIS FILE. It will be overwritten every time. You then need to create a `c_sources/tokenizers.cpp` file and provide your tokenizer implementation there. The `tokenizers.h` file contains the declaration of the function that will be executed when registering your tokenizer. Here is a sample `tokenizers.cpp` implementation:
#include "tokenizers.h"
#include <cctype>
#include <memory>
#include <string>
namespace opsqlite {
fts5_api *fts5_api_from_db(sqlite3 *db) {
fts5_api *pRet = 0;
sqlite3_stmt *pStmt = 0;
if (SQLITE_OK == sqlite3_prepare_v2(db, "SELECT fts5(?1)", -1, &pStmt, 0)) {
sqlite3_bind_pointer(pStmt, 1, (void *)&pRet, "fts5_api_ptr", NULL);
sqlite3_step(pStmt);
}
sqlite3_finalize(pStmt);
return pRet;
}
class WordTokenizer {
public:
WordTokenizer() = default;
~WordTokenizer() = default;
};
// Define `xCreate`, which initializes the tokenizer
int wordTokenizerCreate(void *pUnused, const char **azArg, int nArg,
Fts5Tokenizer **ppOut) {
auto tokenizer = std::make_unique<WordTokenizer>();
*ppOut = reinterpret_cast<Fts5Tokenizer *>(
tokenizer.release()); // Cast to Fts5Tokenizer*
return SQLITE_OK;
}
// Define `xDelete`, which frees the tokenizer
void wordTokenizerDelete(Fts5Tokenizer *pTokenizer) {
delete reinterpret_cast<WordTokenizer *>(pTokenizer);
}
// Define `xTokenize`, which performs the actual tokenization
int wordTokenizerTokenize(Fts5Tokenizer *pTokenizer, void *pCtx, int flags,
const char *pText, int nText,
int (*xToken)(void *, int, const char *, int, int,
int)) {
int start = 0;
int i = 0;
while (i <= nText) {
if (i == nText || !std::isalnum(static_cast<unsigned char>(pText[i]))) {
if (start < i) { // Found a token
int rc = xToken(pCtx, 0, pText + start, i - start, start, i);
if (rc != SQLITE_OK)
return rc;
}
start = i + 1;
}
i++;
}
return SQLITE_OK;
}
int opsqlite_word_tokenizer_init(sqlite3 *db, char **error,
sqlite3_api_routines const *api) {
fts5_tokenizer wordtokenizer = {wordTokenizerCreate, wordTokenizerDelete,
wordTokenizerTokenize};
fts5_api *ftsApi = (fts5_api *)fts5_api_from_db(db);
if (ftsApi == NULL)
return SQLITE_ERROR;
return ftsApi->xCreateTokenizer(ftsApi, "word_tokenizer", NULL,
&wordtokenizer, NULL);
}
} // namespace opsqlite
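A note on `xToken`: it receives the token text, its length in bytes, and the start and end byte offsets of the token within the input. The sample above emits tokens exactly as they appear in the text, so matching will be case-sensitive. A common refinement is to lowercase each token before handing it to `xToken`. Here is a minimal sketch of such a variant (not generated code, just an illustration; you could swap it in for `wordTokenizerTokenize` in the `fts5_tokenizer` struct):

```cpp
// Sketch: case-insensitive variant of xTokenize. Assumes the same includes
// and WordTokenizer class as above. Copies each token into a buffer because
// pText is const and must not be modified.
int wordTokenizerTokenizeCI(Fts5Tokenizer *pTokenizer, void *pCtx, int flags,
                            const char *pText, int nText,
                            int (*xToken)(void *, int, const char *, int, int,
                                          int)) {
  std::string buf;
  int start = 0;

  for (int i = 0; i <= nText; i++) {
    if (i == nText || !std::isalnum(static_cast<unsigned char>(pText[i]))) {
      if (start < i) { // Found a token
        buf.assign(pText + start, i - start);
        for (auto &c : buf)
          c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));

        // The offsets still refer to the token's position in the original
        // text, even though the emitted token text is the lowercased copy.
        int rc = xToken(pCtx, 0, buf.data(), (int)buf.size(), start, i);
        if (rc != SQLITE_OK)
          return rc;
      }
      start = i + 1;
    }
  }

  return SQLITE_OK;
}
```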
You need to keep the namespace and the function signature intact. For now, the `sqlite3_api_routines` parameter will always be a null pointer.
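For reference, with the configuration above the generated `tokenizers.h` would contain a declaration along these lines. This is a sketch only; the real file is generated for you and its exact contents may differ:

```cpp
// Sketch of the generated c_sources/tokenizers.h (hypothetical contents;
// the real file is regenerated on every pod install, so never edit it).
#pragma once

#include <sqlite3.h>

namespace opsqlite {
// One init function is declared per tokenizer listed in package.json.
int opsqlite_word_tokenizer_init(sqlite3 *db, char **error,
                                 sqlite3_api_routines const *api);
} // namespace opsqlite
```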
Once you are done, run `pod install` again. It will copy the files you created into the pod sources, in order to compile op-sqlite together with your new C++ code in one go.
The code generation step is only implemented in CocoaPods. Every time you create or change a file inside `c_sources`, you will need to run `pod install` to re-add the files to the compilation process. This also applies to Android, at least for the header file generation step. On your CI you will also need to run `pod install`, even if your pods are cached, in order to copy the sources.
You can then create an FTS5 virtual table with your tokenizer:
```typescript
import { open } from "@op-engineering/op-sqlite";
import { useEffect } from "react";

let db = open({
  name: "tokenizers.sqlite",
  encryptionKey: "test",
});

// inside your component or wherever you initialize your database
// THIS IS SAMPLE CODE, use your head when creating your tables
useEffect(() => {
  let setup = async () => {
    await db.execute(
      `CREATE VIRTUAL TABLE tokenizer_table USING fts5(content, tokenize = 'word_tokenizer');`
    );

    await db.execute("INSERT INTO tokenizer_table(content) VALUES (?)", [
      "This is a test document",
    ]);

    const res = await db.execute(
      "SELECT content FROM tokenizer_table WHERE content MATCH ?",
      ["test"]
    );

    console.warn(res);
  };

  setup();
}, []);
```