diff --git a/CMakeLists.txt b/CMakeLists.txt
index c1ad691..eb8aebb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -175,8 +175,9 @@ set(TEST_SRC_FILES
     tests/tagvalue.cpp
     tests/testfilecheck.cpp
     tests/utils.cpp)
-
 set(DOC_FILES README.md doc/adding-new-fields.md)
+set(LANGUAGE_HEADER "${CMAKE_CURRENT_BINARY_DIR}/resources/iso_language_codes.h")
+set(RES_FILES "${LANGUAGE_HEADER}")
 
 # find c++utilities
 set(CONFIGURATION_PACKAGE_SUFFIX
@@ -203,16 +204,12 @@ include(TestTarget)
 include(Doxygen)
 include(ConfigHeader)
 
-# write languages header from CSV file
-set(LANGUAGES_HEADER "static const std::unordered_map languages = \{")
-file(STRINGS languages.csv LANGUAGE_ROWS ENCODING UTF-8)
-foreach (LANGUAGE_ROW ${LANGUAGE_ROWS})
-    if (NOT LANGUAGE_ROW MATCHES "([a-z][a-z]) ,([a-z][a-z][a-z]) ,([a-z][a-z][a-z]) ,\"?([^\",]*) \"?,\"?([^\",]*) \"?")
-        continue()
-    endif ()
-    set(LANGUAGE_ABBREVIATION "${CMAKE_MATCH_3}")
-    set(LANGUAGE_NAME "${CMAKE_MATCH_4}")
-    set(LANGUAGES_HEADER "${LANGUAGES_HEADER}\n \{\"${LANGUAGE_ABBREVIATION}\", \"${LANGUAGE_NAME}\"\},")
-endforeach ()
-set(LANGUAGES_HEADER "${LANGUAGES_HEADER}\n};")
-file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/resources/languages.h" "${LANGUAGES_HEADER}")
+# add rules to generate code for dealing with language codes; the header is re-generated whenever the
+# generator script changes (DEPENDS) and arguments are passed unmodified on all platforms (VERBATIM)
+set(LANGUAGE_HEADER_GENERATOR "${CMAKE_CURRENT_SOURCE_DIR}/cmake/scripts/generate_iso_language_codes.cmake")
+add_custom_command(
+    OUTPUT "${LANGUAGE_HEADER}"
+    COMMENT "Generating code for ISO-639-2 language codes"
+    COMMAND "${CMAKE_COMMAND}" "-DOUTPUT_PATH=${LANGUAGE_HEADER}" -P "${LANGUAGE_HEADER_GENERATOR}"
+    DEPENDS "${LANGUAGE_HEADER_GENERATOR}"
+    VERBATIM)
diff --git a/README.md b/README.md
index 44d55a2..fe58c5c 100644
--- a/README.md
+++ b/README.md
@@ -106,8 +106,8 @@ works with all kinds of files. (When forcing rewrite a backup is always created.
 
 ## Build instructions
 The tagparser library depends on [c++utilities](https://github.com/Martchus/cpp-utilities) and is built in the same way.
-It also depends on zlib. For checking integrity of testfiles, the OpenSSL crypto
-library is required.
+It also depends on zlib, iso-codes and requires at least CMake 3.19. For checking integrity of testfiles, the OpenSSL
+crypto library is required.
 
 ## TODOs
 * Support more formats (EXIF, PDF metadata, Theora, ...)
diff --git a/cmake/scripts/generate_iso_language_codes.cmake b/cmake/scripts/generate_iso_language_codes.cmake
new file mode 100755
index 0000000..9d796ef
--- /dev/null
+++ b/cmake/scripts/generate_iso_language_codes.cmake
@@ -0,0 +1,45 @@
+#!/usr/bin/cmake -DOUTPUT_PATH=iso_language_codes.cpp -P
+# Generates a C++ fragment defining a map from ISO-639-2 language codes to language names.
+#
+# Variables (pass via -D before -P):
+#   OUTPUT_PATH   - path of the file to write (required)
+#   LANGUAGE_FILE - ISO-639-2 JSON file to read (optional, defaults to the iso-codes location below)
+cmake_minimum_required(VERSION 3.19.0 FATAL_ERROR)
+
+if (NOT LANGUAGE_FILE)
+    # default to path provided usually by iso-codes package (https://salsa.debian.org/iso-codes-team/iso-codes)
+    set(LANGUAGE_FILE "/usr/share/iso-codes/json/iso_639-2.json")
+endif ()
+if (NOT EXISTS "${LANGUAGE_FILE}")
+    message(FATAL_ERROR "The file ${LANGUAGE_FILE} does not exist.")
+endif ()
+if (NOT OUTPUT_PATH)
+    message(FATAL_ERROR "No OUTPUT_PATH specified.")
+endif ()
+
+file(READ "${LANGUAGE_FILE}" LANGUAGE_JSON)
+string(JSON LANGUAGE_COUNT LENGTH "${LANGUAGE_JSON}" "639-2")
+message(STATUS "Found ${LANGUAGE_COUNT} ISO-639-2 language codes")
+if (LANGUAGE_COUNT LESS 1)
+    # foreach(RANGE) below needs a non-negative upper bound
+    message(FATAL_ERROR "The file ${LANGUAGE_FILE} contains no ISO-639-2 language codes.")
+endif ()
+
+# the key/value types are spelled out because they cannot be deduced from nested braced string literals
+set(OUTPUT "static const std::unordered_map<std::string, std::string> languageNames_iso_639_2_b = {\n")
+math(EXPR LAST_LANGUAGE_INDEX "${LANGUAGE_COUNT} - 1")
+foreach (LANGUAGE_INDEX RANGE "${LAST_LANGUAGE_INDEX}")
+    string(JSON LANGUAGE_ENTRY GET "${LANGUAGE_JSON}" "639-2" "${LANGUAGE_INDEX}")
+    string(JSON LANGUAGE_ALPHA_3 ERROR_VARIABLE ERROR GET "${LANGUAGE_ENTRY}" "alpha_3")
+    # NOTE(review): "bibliographic" is read as a plain string member of the entry - confirm against the iso-codes JSON
+    string(JSON LANGUAGE_BIBLIOGRAPHIC ERROR_VARIABLE ERROR GET "${LANGUAGE_ENTRY}" "bibliographic")
+    string(JSON LANGUAGE_NAME ERROR_VARIABLE ERROR GET "${LANGUAGE_ENTRY}" "name")
+    if (NOT LANGUAGE_BIBLIOGRAPHIC)
+        # a failed lookup yields a "...-NOTFOUND" value which evaluates to false; fall back to the alpha_3 code
+        set(LANGUAGE_BIBLIOGRAPHIC "${LANGUAGE_ALPHA_3}")
+    endif ()
+    string(APPEND OUTPUT " {\"${LANGUAGE_BIBLIOGRAPHIC}\", \"${LANGUAGE_NAME}\"},\n")
+endforeach ()
+string(APPEND OUTPUT "};\n")
+file(WRITE "${OUTPUT_PATH}" "${OUTPUT}")
+message(STATUS "Wrote language entries to ${OUTPUT_PATH}")
diff --git a/languages.csv b/languages.csv
deleted file mode 100644
index 8010dbd..0000000
--- a/languages.csv
+++ /dev/null
@@ -1,186 +0,0 @@
-639-1 ,639-2/T ,639-2/B ,Language name ,Native name
-aa ,aar ,aar ,Afar ,Afaraf
-ab ,abk ,abk ,Abkhaz ,"аҧсуа бызшәа, аҧсшәа "
-ae ,ave ,ave ,Avestan ,avesta
-af ,afr ,afr ,Afrikaans ,Afrikaans
-ak ,aka ,aka ,Akan ,Akan
-am ,amh ,amh ,Amharic ,አማርኛ
-an ,arg ,arg ,Aragonese ,aragonés
-ar ,ara ,ara ,Arabic ,العربية
-as ,asm ,asm ,Assamese ,অসমীয়া
-av ,ava ,ava ,Avaric ,"авар мацӀ, магӀарул мацӀ "
-ay ,aym ,aym ,Aymara ,aymar aru
-az ,aze ,aze ,Azerbaijani ,azərbaycan dili
-az ,azb ,azb ,South Azerbaijani ,تورکجه‎
-ba ,bak ,bak ,Bashkir ,башҡорт теле
-be ,bel ,bel ,Belarusian ,беларуская мова
-bg ,bul ,bul ,Bulgarian ,български език
-bh ,bih ,bih ,Bihari ,भोजपुरी
-bi ,bis ,bis ,Bislama ,Bislama
-bm ,bam ,bam ,Bambara ,bamanankan
-bn ,ben ,ben ,Bengali; Bangla ,বাংলা
-bo ,bod ,tib ,"Tibetan Standard, Tibetan, Central ",བོད་ཡིག
-br ,bre ,bre ,Breton ,brezhoneg
-bs ,bos ,bos ,Bosnian ,bosanski jezik
-ca ,cat ,cat ,Catalan; Valencian ,"català, valencià "
-ce ,che ,che ,Chechen ,нохчийн мотт
-ch ,cha ,cha ,Chamorro ,Chamoru
-co ,cos ,cos ,Corsican ,"corsu, lingua corsa "
-cr ,cre ,cre ,Cree ,ᓀᐦᐃᔭᐍᐏᐣ
-cs ,ces ,cze ,Czech ,"čeština, český jazyk "
-cu ,chu ,chu ,"Old Church Slavonic, Church Slavonic, Old Bulgarian ",ѩзыкъ словѣньскъ
-cv ,chv ,chv ,Chuvash ,чӑваш чӗлхи
-cy ,cym ,wel ,Welsh ,Cymraeg
-da ,dan ,dan ,Danish ,dansk
-de ,deu ,ger ,German ,Deutsch
-dv ,div ,div ,Divehi; Dhivehi; Maldivian; ,ދިވެހި
-dz ,dzo ,dzo ,Dzongkha ,རྫོང་ཁ
-ee ,ewe ,ewe ,Ewe ,Eʋegbe
-el ,ell ,gre ,"Greek, Modern ",ελληνικά
-en ,eng ,eng ,English ,English
-eo ,epo ,epo ,Esperanto ,Esperanto
-es ,spa ,spa ,Spanish; Castilian ,"español, castellano "
-et ,est ,est ,Estonian ,"eesti, 
eesti keel " -eu ,eus ,baq ,Basque ,"euskara, euskera " -fa ,fas ,per ,Persian (Farsi) ,فارسی -ff ,ful ,ful ,Fula; Fulah; Pulaar; Pular ,"Fulfulde, Pulaar, Pular " -fi ,fin ,fin ,Finnish ,"suomi, suomen kieli " -fj ,fij ,fij ,Fijian ,vosa Vakaviti -fo ,fao ,fao ,Faroese ,føroyskt -fr ,fra ,fre ,French ,"français, langue française " -fy ,fry ,fry ,Western Frisian ,Frysk -ga ,gle ,gle ,Irish ,Gaeilge -gd ,gla ,gla ,Scottish Gaelic; Gaelic ,Gàidhlig -gl ,glg ,glg ,Galician ,galego -gn ,grn ,grn ,Guaraní ,Avañe'ẽ -gu ,guj ,guj ,Gujarati ,ગુજરાતી -gv ,glv ,glv ,Manx ,"Gaelg, Gailck " -ha ,hau ,hau ,Hausa ,"Hausa, هَوُسَ " -he ,heb ,heb ,Hebrew (modern) ,עברית -hi ,hin ,hin ,Hindi ,"हिन्दी, हिंदी " -ho ,hmo ,hmo ,Hiri Motu ,Hiri Motu -hr ,hrv ,hrv ,Croatian ,hrvatski jezik -ht ,hat ,hat ,Haitian; Haitian Creole ,Kreyòl ayisyen -hu ,hun ,hun ,Hungarian ,magyar -hy ,hye ,arm ,Armenian ,Հայերեն -hz ,her ,her ,Herero ,Otjiherero -ia ,ina ,ina ,Interlingua ,Interlingua -id ,ind ,ind ,Indonesian ,Bahasa Indonesia -ie ,ile ,ile ,Interlingue ,Originally called Occidental; then Interlingue after WWII -ig ,ibo ,ibo ,Igbo ,Asụsụ Igbo -ii ,iii ,iii ,Nuosu ,ꆈꌠ꒿ Nuosuhxop -ik ,ipk ,ipk ,Inupiaq ,"Iñupiaq, Iñupiatun " -io ,ido ,ido ,Ido ,Ido -is ,isl ,ice ,Icelandic ,Íslenska -it ,ita ,ita ,Italian ,italiano -iu ,iku ,iku ,Inuktitut ,ᐃᓄᒃᑎᑐᑦ -ja ,jpn ,jpn ,Japanese ,日本語 (にほんご) -jv ,jav ,jav ,Javanese ,basa Jawa -ka ,kat ,geo ,Georgian ,ქართული -kg ,kon ,kon ,Kongo ,KiKongo -ki ,kik ,kik ,"Kikuyu, Gikuyu ",Gĩkũyũ -kj ,kua ,kua ,"Kwanyama, Kuanyama ",Kuanyama -kk ,kaz ,kaz ,Kazakh ,қазақ тілі -kl ,kal ,kal ,"Kalaallisut, Greenlandic ","kalaallisut, kalaallit oqaasii " -km ,khm ,khm ,Khmer ,"ខ្មែរ, ខេមរភាសា, ភាសាខ្មែរ " -kn ,kan ,kan ,Kannada ,ಕನ್ನಡ -ko ,kor ,kor ,Korean ,"한국어 (韓國語), 조선어 (朝鮮語) " -kr ,kau ,kau ,Kanuri ,Kanuri -ks ,kas ,kas ,Kashmiri ,"कश्मीरी, كشميري‎ " -ku ,kur ,kur ,Kurdish ,"Kurdî, كوردی‎ " -kv ,kom ,kom ,Komi ,коми кыв -kw ,cor ,cor ,Cornish ,Kernewek -ky ,kir ,kir 
,Kyrgyz ,"Кыргызча, Кыргыз тили " -la ,lat ,lat ,Latin ,"latine, lingua latina " -lb ,ltz ,ltz ,"Luxembourgish, Letzeburgesch ",Lëtzebuergesch -lg ,lug ,lug ,Ganda ,Luganda -li ,lim ,lim ,"Limburgish, Limburgan, Limburger ",Limburgs -ln ,lin ,lin ,Lingala ,Lingála -lo ,lao ,lao ,Lao ,ພາສາລາວ -lt ,lit ,lit ,Lithuanian ,lietuvių kalba -lu ,lub ,lub ,Luba-Katanga ,Tshiluba -lv ,lav ,lav ,Latvian ,latviešu valoda -mg ,mlg ,mlg ,Malagasy ,fiteny malagasy -mh ,mah ,mah ,Marshallese ,Kajin M̧ajeļ -mi ,mri ,mao ,Māori ,te reo Māori -mk ,mkd ,mac ,Macedonian ,македонски јазик -ml ,mal ,mal ,Malayalam ,മലയാളം -mn ,mon ,mon ,Mongolian ,монгол -mr ,mar ,mar ,Marathi (Marāṭhī) ,मराठी -ms ,msa ,may ,Malay ,"bahasa Melayu, بهاس ملايو‎ " -mt ,mlt ,mlt ,Maltese ,Malti -my ,mya ,bur ,Burmese ,ဗမာစာ -na ,nau ,nau ,Nauru ,Ekakairũ Naoero -nb ,nob ,nob ,Norwegian Bokmål ,Norsk bokmål -nd ,nde ,nde ,North Ndebele ,isiNdebele -ne ,nep ,nep ,Nepali ,नेपाली -ng ,ndo ,ndo ,Ndonga ,Owambo -nl ,nld ,dut ,Dutch ,"Nederlands, Vlaams " -nn ,nno ,nno ,Norwegian Nynorsk ,Norsk nynorsk -no ,nor ,nor ,Norwegian ,Norsk -nr ,nbl ,nbl ,South Ndebele ,isiNdebele -nv ,nav ,nav ,"Navajo, Navaho ","Diné bizaad, Dinékʼehǰí " -ny ,nya ,nya ,Chichewa; Chewa; Nyanja ,"chiCheŵa, chinyanja " -oc ,oci ,oci ,Occitan ,"occitan, lenga d'òc " -oj ,oji ,oji ,"Ojibwe, Ojibwa ",ᐊᓂᔑᓈᐯᒧᐎᓐ -om ,orm ,orm ,Oromo ,Afaan Oromoo -or ,ori ,ori ,Oriya ,ଓଡ଼ିଆ -os ,oss ,oss ,"Ossetian, Ossetic ",ирон æвзаг -pa ,pan ,pan ,"Panjabi, Punjabi ","ਪੰਜਾਬੀ, پنجابی‎ " -pi ,pli ,pli ,Pāli ,पाऴि -pl ,pol ,pol ,Polish ,"język polski, polszczyzna " -ps ,pus ,pus ,"Pashto, Pushto ",پښتو -pt ,por ,por ,Portuguese ,português -qu ,que ,que ,Quechua ,"Runa Simi, Kichwa " -rm ,roh ,roh ,Romansh ,rumantsch grischun -rn ,run ,run ,Kirundi ,Ikirundi -ro ,ron ,rum ,Romanian ,limba română -ru ,rus ,rus ,Russian ,русский язык -rw ,kin ,kin ,Kinyarwanda ,Ikinyarwanda -sa ,san ,san ,Sanskrit (Saṁskṛta) ,संस्कृतम् -sc ,srd ,srd ,Sardinian ,sardu -sd ,snd ,snd 
,Sindhi ,"सिन्धी, سنڌي، سندھی‎ " -se ,sme ,sme ,Northern Sami ,Davvisámegiella -sg ,sag ,sag ,Sango ,yângâ tî sängö -si ,sin ,sin ,"Sinhala, Sinhalese ",සිංහල -sk ,slk ,slo ,Slovak ,"slovenčina, slovenský jazyk " -sl ,slv ,slv ,Slovene ,"slovenski jezik, slovenščina " -sm ,smo ,smo ,Samoan ,gagana fa'a Samoa -sn ,sna ,sna ,Shona ,chiShona -so ,som ,som ,Somali ,"Soomaaliga, af Soomaali " -sq ,sqi ,alb ,Albanian ,gjuha shqipe -sr ,srp ,srp ,Serbian ,српски језик -ss ,ssw ,ssw ,Swati ,SiSwati -st ,sot ,sot ,Southern Sotho ,Sesotho -su ,sun ,sun ,Sundanese ,Basa Sunda -sv ,swe ,swe ,Swedish ,Svenska -sw ,swa ,swa ,Swahili ,Kiswahili -ta ,tam ,tam ,Tamil ,தமிழ் -te ,tel ,tel ,Telugu ,తెలుగు -tg ,tgk ,tgk ,Tajik ,"тоҷикӣ, toğikī, تاجیکی‎ " -th ,tha ,tha ,Thai ,ไทย -ti ,tir ,tir ,Tigrinya ,ትግርኛ -tk ,tuk ,tuk ,Turkmen ,"Türkmen, Түркмен " -tl ,tgl ,tgl ,Tagalog ,"Wikang Tagalog, ᜏᜒᜃᜅ᜔ ᜆᜄᜎᜓᜄ᜔ " -tn ,tsn ,tsn ,Tswana ,Setswana -to ,ton ,ton ,Tonga (Tonga Islands) ,faka Tonga -tr ,tur ,tur ,Turkish ,Türkçe -ts ,tso ,tso ,Tsonga ,Xitsonga -tt ,tat ,tat ,Tatar ,"татар теле, tatar tele " -tw ,twi ,twi ,Twi ,Twi -ty ,tah ,tah ,Tahitian ,Reo Tahiti -ug ,uig ,uig ,"Uyghur, Uighur ","Uyƣurqə, ئۇيغۇرچە‎ " -uk ,ukr ,ukr ,Ukrainian ,українська мова -ur ,urd ,urd ,Urdu ,اردو -uz ,uzb ,uzb ,Uzbek ,"O‘zbek, Ўзбек, أۇزبېك‎ " -ve ,ven ,ven ,Venda ,Tshivenḓa -vi ,vie ,vie ,Vietnamese ,Tiếng Việt -vo ,vol ,vol ,Volapük ,Volapük -wa ,wln ,wln ,Walloon ,walon -wo ,wol ,wol ,Wolof ,Wollof -xh ,xho ,xho ,Xhosa ,isiXhosa -yi ,yid ,yid ,Yiddish ,ייִדיש -yo ,yor ,yor ,Yoruba ,Yorùbá -za ,zha ,zha ,"Zhuang, Chuang ","Saɯ cueŋƅ, Saw cuengh " -zh ,zho ,chi ,Chinese ,"中文 (Zhōngwén), 汉语, 漢語 " -zu ,zul ,zul ,Zulu ,isiZulu \ No newline at end of file diff --git a/localehelper.cpp b/localehelper.cpp index 92c2e4f..e963dbc 100644 --- a/localehelper.cpp +++ b/localehelper.cpp @@ -9,10 +9,10 @@ using namespace std::literals; namespace TagParser { /// \cond -static const auto &languageMapping() +static const 
auto &languageNames_ISO_639_2_b() { -#include "resources/languages.h" - return languages; +#include "resources/iso_language_codes.h" + return languageNames_iso_639_2_b; } /// \endcond @@ -30,7 +30,7 @@ inline static bool isLanguageDefined_ISO_639_2(const std::string &languageSpecif */ static const std::string &languageName_ISO_639_2(const std::string &isoCode) { - const auto &mapping = languageMapping(); + const auto &mapping = languageNames_ISO_639_2_b(); const auto i = mapping.find(isoCode); if (i == mapping.cend()) { static const std::string empty;