Generate code for ISO-639-2/B mapping from iso-codes package
This commit is contained in:
parent
2e5ef20af3
commit
cae88246ec
|
@ -175,8 +175,9 @@ set(TEST_SRC_FILES
|
|||
tests/tagvalue.cpp
|
||||
tests/testfilecheck.cpp
|
||||
tests/utils.cpp)
|
||||
|
||||
set(DOC_FILES README.md doc/adding-new-fields.md)
|
||||
set(LANGUAGE_HEADER "${CMAKE_CURRENT_BINARY_DIR}/resources/iso_language_codes.h")
|
||||
set(RES_FILES "${LANGUAGE_HEADER}")
|
||||
|
||||
# find c++utilities
|
||||
set(CONFIGURATION_PACKAGE_SUFFIX
|
||||
|
@ -203,16 +204,9 @@ include(TestTarget)
|
|||
include(Doxygen)
|
||||
include(ConfigHeader)
|
||||
|
||||
# write languages header from CSV file
|
||||
set(LANGUAGES_HEADER "static const std::unordered_map<std::string, std::string> languages = \{")
|
||||
file(STRINGS languages.csv LANGUAGE_ROWS ENCODING UTF-8)
|
||||
foreach (LANGUAGE_ROW ${LANGUAGE_ROWS})
|
||||
if (NOT LANGUAGE_ROW MATCHES "([a-z][a-z]) ,([a-z][a-z][a-z]) ,([a-z][a-z][a-z]) ,\"?([^\",]*) \"?,\"?([^\",]*) \"?")
|
||||
continue()
|
||||
endif ()
|
||||
set(LANGUAGE_ABBREVIATION "${CMAKE_MATCH_3}")
|
||||
set(LANGUAGE_NAME "${CMAKE_MATCH_4}")
|
||||
set(LANGUAGES_HEADER "${LANGUAGES_HEADER}\n \{\"${LANGUAGE_ABBREVIATION}\", \"${LANGUAGE_NAME}\"\},")
|
||||
endforeach ()
|
||||
set(LANGUAGES_HEADER "${LANGUAGES_HEADER}\n};")
|
||||
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/resources/languages.h" "${LANGUAGES_HEADER}")
|
||||
# Add a build rule that generates the header holding the ISO-639-2 language code mapping.
# The generator script runs in CMake script mode (-P) and receives the header location
# via OUTPUT_PATH. The script itself is listed under DEPENDS so the header is regenerated
# whenever the script changes; VERBATIM keeps argument escaping identical on all platforms.
add_custom_command(
    OUTPUT "${LANGUAGE_HEADER}"
    COMMAND "${CMAKE_COMMAND}" "-DOUTPUT_PATH=${LANGUAGE_HEADER}" -P
            "${CMAKE_CURRENT_SOURCE_DIR}/cmake/scripts/generate_iso_language_codes.cmake"
    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/cmake/scripts/generate_iso_language_codes.cmake"
    COMMENT "Generating code for ISO-639-2 language codes"
    VERBATIM)
|
||||
|
|
|
@ -106,8 +106,8 @@ works with all kinds of files. (When forcing rewrite a backup is always created.
|
|||
## Build instructions
|
||||
The tagparser library depends on [c++utilities](https://github.com/Martchus/cpp-utilities) and is built
|
||||
in the same way.
|
||||
It also depends on zlib. For checking integrity of testfiles, the OpenSSL crypto
|
||||
library is required.
|
||||
It also depends on zlib, iso-codes and requires at least CMake 3.19. For checking integrity of testfiles, the OpenSSL
|
||||
crypto library is required.
|
||||
|
||||
## TODOs
|
||||
* Support more formats (EXIF, PDF metadata, Theora, ...)
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
#!/usr/bin/cmake -DOUTPUT_PATH=iso_language_codes.cpp -P
# Generates C++ code for ISO-639-2 language codes.
#
# Script-mode inputs (pass with -D before -P):
#   OUTPUT_PATH   (required) file the generated C++ code is written to
#   LANGUAGE_FILE (optional) JSON file to read the language data from; defaults to the
#                 path where the iso-codes package usually installs iso_639-2.json
#
# The generated code defines "languageNames_iso_639_2_b", an unordered map from the
# three-letter language code to the language name. The bibliographic (ISO-639-2/B) code
# is used as key where an entry provides one; otherwise the entry's "alpha_3" code is used.
cmake_minimum_required(VERSION 3.19.0 FATAL_ERROR) # string(JSON ...) is used below

if (NOT LANGUAGE_FILE)
    # default to path provided usually by iso-codes package (https://salsa.debian.org/iso-codes-team/iso-codes)
    set(LANGUAGE_FILE "/usr/share/iso-codes/json/iso_639-2.json")
endif ()
if (NOT EXISTS "${LANGUAGE_FILE}")
    message(FATAL_ERROR "The file ${LANGUAGE_FILE} does not exist.")
endif ()
if (NOT OUTPUT_PATH)
    message(FATAL_ERROR "No OUTPUT_PATH specified.")
endif ()

set(OUTPUT "static const std::unordered_map<std::string, std::string> languageNames_iso_639_2_b = {\n")
file(READ "${LANGUAGE_FILE}" LANGUAGE_JSON)
string(JSON LANGUAGE_COUNT LENGTH "${LANGUAGE_JSON}" "639-2")
message(STATUS "Found ${LANGUAGE_COUNT} ISO-639-2 language codes")

# foreach(RANGE n) iterates 0..n inclusive, hence the last valid index is passed;
# an empty array is skipped entirely because RANGE cannot express "no iterations"
if (LANGUAGE_COUNT GREATER 0)
    math(EXPR LAST_LANGUAGE_INDEX "${LANGUAGE_COUNT} - 1")
    foreach (LANGUAGE_INDEX RANGE "${LAST_LANGUAGE_INDEX}")
        string(JSON LANGUAGE_ENTRY GET "${LANGUAGE_JSON}" "639-2" "${LANGUAGE_INDEX}")

        # with ERROR_VARIABLE a failed lookup does not abort the script; the error variable
        # then holds the message and is set to NOTFOUND (false) when the lookup succeeded
        string(JSON LANGUAGE_ALPHA_3 ERROR_VARIABLE LANGUAGE_ALPHA_3_ERROR GET "${LANGUAGE_ENTRY}" "alpha_3")
        string(JSON LANGUAGE_NAME ERROR_VARIABLE LANGUAGE_NAME_ERROR GET "${LANGUAGE_ENTRY}" "name")
        # "bibliographic" is a plain string member which is only present for languages
        # where the bibliographic code differs from "alpha_3"
        string(JSON LANGUAGE_BIBLIOGRAPHIC ERROR_VARIABLE LANGUAGE_BIBLIOGRAPHIC_ERROR GET "${LANGUAGE_ENTRY}"
               "bibliographic")

        # an entry lacking code or name cannot be turned into a meaningful map entry
        if (LANGUAGE_ALPHA_3_ERROR OR LANGUAGE_NAME_ERROR)
            message(WARNING "Skipping ISO-639-2 entry ${LANGUAGE_INDEX} as it lacks \"alpha_3\" or \"name\".")
            continue()
        endif ()

        # fall back to the regular three-letter code if there is no distinct bibliographic code
        if (LANGUAGE_BIBLIOGRAPHIC_ERROR)
            set(LANGUAGE_BIBLIOGRAPHIC "${LANGUAGE_ALPHA_3}")
        endif ()

        # escape backslashes and double quotes so the name is a valid C++ string literal
        string(REPLACE "\\" "\\\\" LANGUAGE_NAME "${LANGUAGE_NAME}")
        string(REPLACE "\"" "\\\"" LANGUAGE_NAME "${LANGUAGE_NAME}")

        string(APPEND OUTPUT " {\"${LANGUAGE_BIBLIOGRAPHIC}\", \"${LANGUAGE_NAME}\"},\n")
    endforeach ()
endif ()

string(APPEND OUTPUT "};\n")
file(WRITE "${OUTPUT_PATH}" "${OUTPUT}")
message(STATUS "Wrote language entries to ${OUTPUT_PATH}")
|
186
languages.csv
186
languages.csv
|
@ -1,186 +0,0 @@
|
|||
639-1 ,639-2/T ,639-2/B ,Language name ,Native name
|
||||
aa ,aar ,aar ,Afar ,Afaraf
|
||||
ab ,abk ,abk ,Abkhaz ,"аҧсуа бызшәа, аҧсшәа "
|
||||
ae ,ave ,ave ,Avestan ,avesta
|
||||
af ,afr ,afr ,Afrikaans ,Afrikaans
|
||||
ak ,aka ,aka ,Akan ,Akan
|
||||
am ,amh ,amh ,Amharic ,አማርኛ
|
||||
an ,arg ,arg ,Aragonese ,aragonés
|
||||
ar ,ara ,ara ,Arabic ,العربية
|
||||
as ,asm ,asm ,Assamese ,অসমীয়া
|
||||
av ,ava ,ava ,Avaric ,"авар мацӀ, магӀарул мацӀ "
|
||||
ay ,aym ,aym ,Aymara ,aymar aru
|
||||
az ,aze ,aze ,Azerbaijani ,azərbaycan dili
|
||||
az ,azb ,azb ,South Azerbaijani ,تورکجه
|
||||
ba ,bak ,bak ,Bashkir ,башҡорт теле
|
||||
be ,bel ,bel ,Belarusian ,беларуская мова
|
||||
bg ,bul ,bul ,Bulgarian ,български език
|
||||
bh ,bih ,bih ,Bihari ,भोजपुरी
|
||||
bi ,bis ,bis ,Bislama ,Bislama
|
||||
bm ,bam ,bam ,Bambara ,bamanankan
|
||||
bn ,ben ,ben ,Bengali; Bangla ,বাংলা
|
||||
bo ,bod ,tib ,"Tibetan Standard, Tibetan, Central ",བོད་ཡིག
|
||||
br ,bre ,bre ,Breton ,brezhoneg
|
||||
bs ,bos ,bos ,Bosnian ,bosanski jezik
|
||||
ca ,cat ,cat ,Catalan; Valencian ,"català, valencià "
|
||||
ce ,che ,che ,Chechen ,нохчийн мотт
|
||||
ch ,cha ,cha ,Chamorro ,Chamoru
|
||||
co ,cos ,cos ,Corsican ,"corsu, lingua corsa "
|
||||
cr ,cre ,cre ,Cree ,ᓀᐦᐃᔭᐍᐏᐣ
|
||||
cs ,ces ,cze ,Czech ,"čeština, český jazyk "
|
||||
cu ,chu ,chu ,"Old Church Slavonic, Church Slavonic, Old Bulgarian ",ѩзыкъ словѣньскъ
|
||||
cv ,chv ,chv ,Chuvash ,чӑваш чӗлхи
|
||||
cy ,cym ,wel ,Welsh ,Cymraeg
|
||||
da ,dan ,dan ,Danish ,dansk
|
||||
de ,deu ,ger ,German ,Deutsch
|
||||
dv ,div ,div ,Divehi; Dhivehi; Maldivian; ,ދިވެހި
|
||||
dz ,dzo ,dzo ,Dzongkha ,རྫོང་ཁ
|
||||
ee ,ewe ,ewe ,Ewe ,Eʋegbe
|
||||
el ,ell ,gre ,"Greek, Modern ",ελληνικά
|
||||
en ,eng ,eng ,English ,English
|
||||
eo ,epo ,epo ,Esperanto ,Esperanto
|
||||
es ,spa ,spa ,Spanish; Castilian ,"español, castellano "
|
||||
et ,est ,est ,Estonian ,"eesti, eesti keel "
|
||||
eu ,eus ,baq ,Basque ,"euskara, euskera "
|
||||
fa ,fas ,per ,Persian (Farsi) ,فارسی
|
||||
ff ,ful ,ful ,Fula; Fulah; Pulaar; Pular ,"Fulfulde, Pulaar, Pular "
|
||||
fi ,fin ,fin ,Finnish ,"suomi, suomen kieli "
|
||||
fj ,fij ,fij ,Fijian ,vosa Vakaviti
|
||||
fo ,fao ,fao ,Faroese ,føroyskt
|
||||
fr ,fra ,fre ,French ,"français, langue française "
|
||||
fy ,fry ,fry ,Western Frisian ,Frysk
|
||||
ga ,gle ,gle ,Irish ,Gaeilge
|
||||
gd ,gla ,gla ,Scottish Gaelic; Gaelic ,Gàidhlig
|
||||
gl ,glg ,glg ,Galician ,galego
|
||||
gn ,grn ,grn ,Guaraní ,Avañe'ẽ
|
||||
gu ,guj ,guj ,Gujarati ,ગુજરાતી
|
||||
gv ,glv ,glv ,Manx ,"Gaelg, Gailck "
|
||||
ha ,hau ,hau ,Hausa ,"Hausa, هَوُسَ "
|
||||
he ,heb ,heb ,Hebrew (modern) ,עברית
|
||||
hi ,hin ,hin ,Hindi ,"हिन्दी, हिंदी "
|
||||
ho ,hmo ,hmo ,Hiri Motu ,Hiri Motu
|
||||
hr ,hrv ,hrv ,Croatian ,hrvatski jezik
|
||||
ht ,hat ,hat ,Haitian; Haitian Creole ,Kreyòl ayisyen
|
||||
hu ,hun ,hun ,Hungarian ,magyar
|
||||
hy ,hye ,arm ,Armenian ,Հայերեն
|
||||
hz ,her ,her ,Herero ,Otjiherero
|
||||
ia ,ina ,ina ,Interlingua ,Interlingua
|
||||
id ,ind ,ind ,Indonesian ,Bahasa Indonesia
|
||||
ie ,ile ,ile ,Interlingue ,Originally called Occidental; then Interlingue after WWII
|
||||
ig ,ibo ,ibo ,Igbo ,Asụsụ Igbo
|
||||
ii ,iii ,iii ,Nuosu ,ꆈꌠ꒿ Nuosuhxop
|
||||
ik ,ipk ,ipk ,Inupiaq ,"Iñupiaq, Iñupiatun "
|
||||
io ,ido ,ido ,Ido ,Ido
|
||||
is ,isl ,ice ,Icelandic ,Íslenska
|
||||
it ,ita ,ita ,Italian ,italiano
|
||||
iu ,iku ,iku ,Inuktitut ,ᐃᓄᒃᑎᑐᑦ
|
||||
ja ,jpn ,jpn ,Japanese ,日本語 (にほんご)
|
||||
jv ,jav ,jav ,Javanese ,basa Jawa
|
||||
ka ,kat ,geo ,Georgian ,ქართული
|
||||
kg ,kon ,kon ,Kongo ,KiKongo
|
||||
ki ,kik ,kik ,"Kikuyu, Gikuyu ",Gĩkũyũ
|
||||
kj ,kua ,kua ,"Kwanyama, Kuanyama ",Kuanyama
|
||||
kk ,kaz ,kaz ,Kazakh ,қазақ тілі
|
||||
kl ,kal ,kal ,"Kalaallisut, Greenlandic ","kalaallisut, kalaallit oqaasii "
|
||||
km ,khm ,khm ,Khmer ,"ខ្មែរ, ខេមរភាសា, ភាសាខ្មែរ "
|
||||
kn ,kan ,kan ,Kannada ,ಕನ್ನಡ
|
||||
ko ,kor ,kor ,Korean ,"한국어 (韓國語), 조선어 (朝鮮語) "
|
||||
kr ,kau ,kau ,Kanuri ,Kanuri
|
||||
ks ,kas ,kas ,Kashmiri ,"कश्मीरी, كشميري "
|
||||
ku ,kur ,kur ,Kurdish ,"Kurdî, كوردی "
|
||||
kv ,kom ,kom ,Komi ,коми кыв
|
||||
kw ,cor ,cor ,Cornish ,Kernewek
|
||||
ky ,kir ,kir ,Kyrgyz ,"Кыргызча, Кыргыз тили "
|
||||
la ,lat ,lat ,Latin ,"latine, lingua latina "
|
||||
lb ,ltz ,ltz ,"Luxembourgish, Letzeburgesch ",Lëtzebuergesch
|
||||
lg ,lug ,lug ,Ganda ,Luganda
|
||||
li ,lim ,lim ,"Limburgish, Limburgan, Limburger ",Limburgs
|
||||
ln ,lin ,lin ,Lingala ,Lingála
|
||||
lo ,lao ,lao ,Lao ,ພາສາລາວ
|
||||
lt ,lit ,lit ,Lithuanian ,lietuvių kalba
|
||||
lu ,lub ,lub ,Luba-Katanga ,Tshiluba
|
||||
lv ,lav ,lav ,Latvian ,latviešu valoda
|
||||
mg ,mlg ,mlg ,Malagasy ,fiteny malagasy
|
||||
mh ,mah ,mah ,Marshallese ,Kajin M̧ajeļ
|
||||
mi ,mri ,mao ,Māori ,te reo Māori
|
||||
mk ,mkd ,mac ,Macedonian ,македонски јазик
|
||||
ml ,mal ,mal ,Malayalam ,മലയാളം
|
||||
mn ,mon ,mon ,Mongolian ,монгол
|
||||
mr ,mar ,mar ,Marathi (Marāṭhī) ,मराठी
|
||||
ms ,msa ,may ,Malay ,"bahasa Melayu, بهاس ملايو "
|
||||
mt ,mlt ,mlt ,Maltese ,Malti
|
||||
my ,mya ,bur ,Burmese ,ဗမာစာ
|
||||
na ,nau ,nau ,Nauru ,Ekakairũ Naoero
|
||||
nb ,nob ,nob ,Norwegian Bokmål ,Norsk bokmål
|
||||
nd ,nde ,nde ,North Ndebele ,isiNdebele
|
||||
ne ,nep ,nep ,Nepali ,नेपाली
|
||||
ng ,ndo ,ndo ,Ndonga ,Owambo
|
||||
nl ,nld ,dut ,Dutch ,"Nederlands, Vlaams "
|
||||
nn ,nno ,nno ,Norwegian Nynorsk ,Norsk nynorsk
|
||||
no ,nor ,nor ,Norwegian ,Norsk
|
||||
nr ,nbl ,nbl ,South Ndebele ,isiNdebele
|
||||
nv ,nav ,nav ,"Navajo, Navaho ","Diné bizaad, Dinékʼehǰí "
|
||||
ny ,nya ,nya ,Chichewa; Chewa; Nyanja ,"chiCheŵa, chinyanja "
|
||||
oc ,oci ,oci ,Occitan ,"occitan, lenga d'òc "
|
||||
oj ,oji ,oji ,"Ojibwe, Ojibwa ",ᐊᓂᔑᓈᐯᒧᐎᓐ
|
||||
om ,orm ,orm ,Oromo ,Afaan Oromoo
|
||||
or ,ori ,ori ,Oriya ,ଓଡ଼ିଆ
|
||||
os ,oss ,oss ,"Ossetian, Ossetic ",ирон æвзаг
|
||||
pa ,pan ,pan ,"Panjabi, Punjabi ","ਪੰਜਾਬੀ, پنجابی "
|
||||
pi ,pli ,pli ,Pāli ,पाऴि
|
||||
pl ,pol ,pol ,Polish ,"język polski, polszczyzna "
|
||||
ps ,pus ,pus ,"Pashto, Pushto ",پښتو
|
||||
pt ,por ,por ,Portuguese ,português
|
||||
qu ,que ,que ,Quechua ,"Runa Simi, Kichwa "
|
||||
rm ,roh ,roh ,Romansh ,rumantsch grischun
|
||||
rn ,run ,run ,Kirundi ,Ikirundi
|
||||
ro ,ron ,rum ,Romanian ,limba română
|
||||
ru ,rus ,rus ,Russian ,русский язык
|
||||
rw ,kin ,kin ,Kinyarwanda ,Ikinyarwanda
|
||||
sa ,san ,san ,Sanskrit (Saṁskṛta) ,संस्कृतम्
|
||||
sc ,srd ,srd ,Sardinian ,sardu
|
||||
sd ,snd ,snd ,Sindhi ,"सिन्धी, سنڌي، سندھی "
|
||||
se ,sme ,sme ,Northern Sami ,Davvisámegiella
|
||||
sg ,sag ,sag ,Sango ,yângâ tî sängö
|
||||
si ,sin ,sin ,"Sinhala, Sinhalese ",සිංහල
|
||||
sk ,slk ,slo ,Slovak ,"slovenčina, slovenský jazyk "
|
||||
sl ,slv ,slv ,Slovene ,"slovenski jezik, slovenščina "
|
||||
sm ,smo ,smo ,Samoan ,gagana fa'a Samoa
|
||||
sn ,sna ,sna ,Shona ,chiShona
|
||||
so ,som ,som ,Somali ,"Soomaaliga, af Soomaali "
|
||||
sq ,sqi ,alb ,Albanian ,gjuha shqipe
|
||||
sr ,srp ,srp ,Serbian ,српски језик
|
||||
ss ,ssw ,ssw ,Swati ,SiSwati
|
||||
st ,sot ,sot ,Southern Sotho ,Sesotho
|
||||
su ,sun ,sun ,Sundanese ,Basa Sunda
|
||||
sv ,swe ,swe ,Swedish ,Svenska
|
||||
sw ,swa ,swa ,Swahili ,Kiswahili
|
||||
ta ,tam ,tam ,Tamil ,தமிழ்
|
||||
te ,tel ,tel ,Telugu ,తెలుగు
|
||||
tg ,tgk ,tgk ,Tajik ,"тоҷикӣ, toğikī, تاجیکی "
|
||||
th ,tha ,tha ,Thai ,ไทย
|
||||
ti ,tir ,tir ,Tigrinya ,ትግርኛ
|
||||
tk ,tuk ,tuk ,Turkmen ,"Türkmen, Түркмен "
|
||||
tl ,tgl ,tgl ,Tagalog ,"Wikang Tagalog, ᜏᜒᜃᜅ᜔ ᜆᜄᜎᜓᜄ᜔ "
|
||||
tn ,tsn ,tsn ,Tswana ,Setswana
|
||||
to ,ton ,ton ,Tonga (Tonga Islands) ,faka Tonga
|
||||
tr ,tur ,tur ,Turkish ,Türkçe
|
||||
ts ,tso ,tso ,Tsonga ,Xitsonga
|
||||
tt ,tat ,tat ,Tatar ,"татар теле, tatar tele "
|
||||
tw ,twi ,twi ,Twi ,Twi
|
||||
ty ,tah ,tah ,Tahitian ,Reo Tahiti
|
||||
ug ,uig ,uig ,"Uyghur, Uighur ","Uyƣurqə, ئۇيغۇرچە "
|
||||
uk ,ukr ,ukr ,Ukrainian ,українська мова
|
||||
ur ,urd ,urd ,Urdu ,اردو
|
||||
uz ,uzb ,uzb ,Uzbek ,"O‘zbek, Ўзбек, أۇزبېك "
|
||||
ve ,ven ,ven ,Venda ,Tshivenḓa
|
||||
vi ,vie ,vie ,Vietnamese ,Tiếng Việt
|
||||
vo ,vol ,vol ,Volapük ,Volapük
|
||||
wa ,wln ,wln ,Walloon ,walon
|
||||
wo ,wol ,wol ,Wolof ,Wollof
|
||||
xh ,xho ,xho ,Xhosa ,isiXhosa
|
||||
yi ,yid ,yid ,Yiddish ,ייִדיש
|
||||
yo ,yor ,yor ,Yoruba ,Yorùbá
|
||||
za ,zha ,zha ,"Zhuang, Chuang ","Saɯ cueŋƅ, Saw cuengh "
|
||||
zh ,zho ,chi ,Chinese ,"中文 (Zhōngwén), 汉语, 漢語 "
|
||||
zu ,zul ,zul ,Zulu ,isiZulu
|
|
|
@ -9,10 +9,10 @@ using namespace std::literals;
|
|||
namespace TagParser {
|
||||
|
||||
/// \cond
|
||||
static const auto &languageMapping()
|
||||
static const auto &languageNames_ISO_639_2_b()
|
||||
{
|
||||
#include "resources/languages.h"
|
||||
return languages;
|
||||
#include "resources/iso_language_codes.h"
|
||||
return languageNames_iso_639_2_b;
|
||||
}
|
||||
/// \endcond
|
||||
|
||||
|
@ -30,7 +30,7 @@ inline static bool isLanguageDefined_ISO_639_2(const std::string &languageSpecif
|
|||
*/
|
||||
static const std::string &languageName_ISO_639_2(const std::string &isoCode)
|
||||
{
|
||||
const auto &mapping = languageMapping();
|
||||
const auto &mapping = languageNames_ISO_639_2_b();
|
||||
const auto i = mapping.find(isoCode);
|
||||
if (i == mapping.cend()) {
|
||||
static const std::string empty;
|
||||
|
|
Loading…
Reference in New Issue