mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-05-11 16:40:16 +00:00
* initial commit * cleanup * fix whitelist arg parsing and simplify keyword search state * rename white* to allow* * add vocab_pieces init function, rename update functions, delete accidentally added file * delete temporary bias code * auto-generate fill function with script data inside * deduplicate allowlist unicode rule parsing * minor cleanup * delete unnecessary header * refactor allowlist to support sequential rule sets via keywords * add early exit for zero-rules case * delete accidentally added file
111 lines
2.6 KiB
Python
111 lines
2.6 KiB
Python
|
|
from collections import defaultdict
|
|
|
|
import requests
|
|
|
|
# Exclusive upper bound of the codepoint values accepted from the data file.
MAX_CODEPOINTS = 0x110000

SCRIPT_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/Scripts.txt"

# Seconds to wait on the server; without a timeout requests may block forever.
REQUEST_TIMEOUT = 60

res = requests.get(SCRIPT_DATA_URL, timeout=REQUEST_TIMEOUT)
res.raise_for_status()
data = res.content.decode()

# Each entry is [first codepoint, last codepoint, lowercased script name].
cptL_cptU_script = []
for line in data.splitlines():
    line = line.split()
    if len(line) <= 1 or line[0].startswith("#"):
        continue  # blank line or comment

    # Data lines are tokenized as: <cpt or first..last> ; <Script> # <rest>
    cpt = line[0].split("..")
    if len(cpt) == 1:
        cpt += cpt  # single codepoint: first == last
    cpt_lower, cpt_upper = cpt

    cpt_lower = int(cpt_lower, 16)
    cpt_upper = int(cpt_upper, 16)
    if cpt_lower >= MAX_CODEPOINTS or cpt_upper >= MAX_CODEPOINTS:
        # Skip only this record. The file is grouped by script rather than
        # sorted by codepoint, so stopping here would drop valid later records.
        continue

    assert line[1] == ";"

    script = line[2].lower()

    assert line[3] == "#"

    # line[4] is the general category token; it is not needed here.

    cptL_cptU_script.append([cpt_lower, cpt_upper, script])

# Required, not optional: the merge below and the binary search in the
# generated C++ both rely on ranges being ordered by first codepoint.
cptL_cptU_script.sort(key=lambda x: x[0])

# merge neighboring codepoint ranges that belong to the same script
merged = []
for cpt_lower, cpt_upper, script in cptL_cptU_script:
    if merged and merged[-1][2] == script and merged[-1][1] + 1 == cpt_lower:
        merged[-1][1] = cpt_upper  # extend the previous range
    else:
        merged.append([cpt_lower, cpt_upper, script])
cptL_cptU_script = merged
|
|
|
|
def out(line=""):
    """Write `line` followed by a newline to standard output."""
    print(line)  # noqa
|
|
|
|
# Generate 'unicode-script-data.cpp':
#   python scripts/gen-unicode-script-data.py > src/unicode-script-data.cpp
#
# The emitted function maps every codepoint of a UTF-8 string to a script
# name. The three tables below are parallel: entry i describes the range
# [unicode_script_firsts[i], unicode_script_lasts[i]] of unicode_scripts[i].

out("""\
// generated with scripts/gen-unicode-script-data.py

#include "unicode.h"
#include "unicode-data.h"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <string>
#include <vector>
""")

out("""\
size_t unicode_fill_from_utf8(std::string* utf8, std::vector<uint32_t>* dst_cpts, std::vector<std::string>* dst_scripts) {
    if (utf8 == nullptr) {
        return 0;
    }
""")

out("static const std::vector<std::string> unicode_scripts = {")
for _, _, script in cptL_cptU_script:
    out("    \"%s\"," % script)
out("};")

# Range starts are needed as well: the listed ranges have gaps, so knowing
# only where a range ends cannot tell whether a codepoint is inside it.
out("static const std::vector<uint32_t> unicode_script_firsts = {")
for cpt_lower, _, _ in cptL_cptU_script:
    out("    0x%06X," % cpt_lower)
out("};")

out("static const std::vector<uint32_t> unicode_script_lasts = {")
for _, cpt_upper, _ in cptL_cptU_script:
    out("    0x%06X," % cpt_upper)
out("};")

out("""\
    const auto cpts = unicode_cpts_from_utf8(*utf8);
    const size_t n_cpt = cpts.size();

    std::vector<std::string> scripts;
    scripts.reserve(n_cpt);

    for (const auto& cpt: cpts) {
        // first range whose last codepoint is >= cpt
        const auto it = std::lower_bound(unicode_script_lasts.begin(), unicode_script_lasts.end(), cpt);
        const size_t idx = static_cast<size_t>(std::distance(unicode_script_lasts.begin(), it));
        if (it != unicode_script_lasts.end() && unicode_script_firsts[idx] <= cpt) {
            scripts.push_back(unicode_scripts[idx]);
        } else {
            // not covered by any listed range; still push an entry so that
            // scripts stays aligned one-to-one with cpts
            scripts.push_back("unknown");
        }
    }

    if (dst_cpts != nullptr) {
        *dst_cpts = cpts;
    }
    if (dst_scripts != nullptr) {
        *dst_scripts = scripts;
    }

    return n_cpt;
}
""")
|