From 6e9b39726d9e6fad2871c994a9f481d9f2c2d56a Mon Sep 17 00:00:00 2001
From: Martchus
Date: Sat, 17 Aug 2019 20:56:09 +0200
Subject: [PATCH] Add conversion from ISO-639-2/B codes to language names

---
 CMakeLists.txt          |  25 ++++++
 abstracttrack.cpp       |   5 +-
 language.cpp            |  46 ++++++++++
 language.h              |  26 ++++++
 languages.csv           | 186 ++++++++++++++++++++++++++++++++++++++++
 mediafileinfo.cpp       |   6 +-
 tests/mediafileinfo.cpp |   2 +-
 7 files changed, 290 insertions(+), 6 deletions(-)
 create mode 100644 language.cpp
 create mode 100644 language.h
 create mode 100644 languages.csv

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1f8afaf..1192877 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,6 +47,7 @@ set(HEADER_FILES
     id3/id3v2tag.h
     ivf/ivfframe.h
     ivf/ivfstream.h
+    language.h
     localeawarestring.h
     margin.h
     matroska/ebmlelement.h
@@ -121,6 +122,7 @@ set(SRC_FILES
     id3/id3v2tag.cpp
     ivf/ivfframe.cpp
     ivf/ivfstream.cpp
+    language.cpp
     localeawarestring.cpp
     matroska/ebmlelement.cpp
     matroska/matroskaattachment.cpp
@@ -200,3 +202,26 @@ include(LibraryTarget)
 include(TestTarget)
 include(Doxygen)
 include(ConfigHeader)
+
+# write languages header from CSV file
+set(LANGUAGES_HEADER "static const std::unordered_map<std::string, std::string> languages = \{")
+# re-run CMake when the CSV file changes so the generated header does not become stale
+set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/languages.csv")
+# matches the columns 639-1, 639-2/T and 639-2/B (captured) of a row; tolerates spaces or tabs before the commas
+set(LANGUAGE_ROW_PREFIX "^[a-z][a-z][ \t]*,[a-z][a-z][a-z][ \t]*,([a-z][a-z][a-z])[ \t]*,")
+file(STRINGS languages.csv LANGUAGE_ROWS ENCODING UTF-8)
+foreach (LANGUAGE_ROW ${LANGUAGE_ROWS})
+    # language names containing commas are enclosed in quotes in the CSV file; try that form first
+    if (LANGUAGE_ROW MATCHES "${LANGUAGE_ROW_PREFIX}\"([^\"]*)\"")
+        set(LANGUAGE_NAME "${CMAKE_MATCH_2}")
+    elseif (LANGUAGE_ROW MATCHES "${LANGUAGE_ROW_PREFIX}([^\",]+)")
+        set(LANGUAGE_NAME "${CMAKE_MATCH_2}")
+    else ()
+        continue() # header row or malformed row
+    endif ()
+    set(LANGUAGE_ABBREVIATION "${CMAKE_MATCH_1}")
+    string(STRIP "${LANGUAGE_NAME}" LANGUAGE_NAME)
+    set(LANGUAGES_HEADER "${LANGUAGES_HEADER}\n    \{\"${LANGUAGE_ABBREVIATION}\", \"${LANGUAGE_NAME}\"\},")
+endforeach ()
+set(LANGUAGES_HEADER "${LANGUAGES_HEADER}\n};")
+file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/resources/languages.h" "${LANGUAGES_HEADER}")
diff --git a/abstracttrack.cpp b/abstracttrack.cpp
index 7c05d0c..92c1fbc 100644
--- a/abstracttrack.cpp
+++ b/abstracttrack.cpp
@@ -1,5 +1,6 @@
 #include "./abstracttrack.h"
 #include "./exceptions.h"
+#include "./language.h"
 #include "./mediaformat.h"
 #include "./mp4/mp4ids.h"
 
@@ -137,8 +138,8 @@ string AbstractTrack::label() const
     if (!name().empty()) {
         ss << ", name: \"" << name() << "\"";
     }
-    if (!language().empty() && language() != "und") {
-        ss << ", language: \"" << language() << "\"";
+    if (isLanguageDefined(language())) {
+        ss << ", language: " << languageNameFromIsoWithFallback(language());
     }
     return ss.str();
 }
diff --git a/language.cpp b/language.cpp
new file mode 100644
index 0000000..24fea30
--- /dev/null
+++ b/language.cpp
@@ -0,0 +1,46 @@
+#include "./language.h"
+
+#include <unordered_map>
+
+namespace TagParser {
+
+/// \cond
+// returns the mapping which is written to resources/languages.h from languages.csv by CMakeLists.txt
+static const auto &languageMapping()
+{
+#include "resources/languages.h"
+    return languages;
+}
+
+// returns the name mapped to isoCode or nullptr if isoCode is not part of the mapping
+static const std::string *findLanguageName(const std::string &isoCode)
+{
+    const auto &mapping = languageMapping();
+    const auto i = mapping.find(isoCode);
+    return i != mapping.cend() ? &i->second : nullptr;
+}
+/// \endcond
+
+/*!
+ * \brief Returns the language name for the specified ISO-639-2 code (bibliographic, 639-2/B).
+ * \remarks If \a isoCode is unknown an empty string is returned.
+ */
+const std::string &languageNameFromIso(const std::string &isoCode)
+{
+    static const std::string empty;
+    const auto *const name = findLanguageName(isoCode);
+    return name ? *name : empty;
+}
+
+/*!
+ * \brief Returns the language name for the specified ISO-639-2 code (bibliographic, 639-2/B).
+ * \remarks If \a isoCode is unknown the \a isoCode itself is returned. In that case the returned
+ *          reference refers to \a isoCode and is only valid as long as \a isoCode is.
+ */
+const std::string &languageNameFromIsoWithFallback(const std::string &isoCode)
+{
+    const auto *const name = findLanguageName(isoCode);
+    return name ? *name : isoCode;
+}
+
+} // namespace TagParser
diff --git a/language.h b/language.h
new file mode 100644
index 0000000..07569e2
--- /dev/null
+++ b/language.h
@@ -0,0 +1,26 @@
+#ifndef TAG_PARSER_LANGUAGE_H
+#define TAG_PARSER_LANGUAGE_H
+
+#include "./global.h"
+
+#include
+
+#include
+#include
+
+namespace TagParser {
+
+/*!
+ * \brief Returns whether \a languageSpecification is not empty or undefined. + */ +inline bool isLanguageDefined(const std::string &languageSpecification) +{ + return !languageSpecification.empty() && languageSpecification != "und"; +} + +TAG_PARSER_EXPORT const std::string &languageNameFromIso(const std::string &isoCode); +TAG_PARSER_EXPORT const std::string &languageNameFromIsoWithFallback(const std::string &isoCode); + +} // namespace TagParser + +#endif // TAG_PARSER_LANGUAGE_H diff --git a/languages.csv b/languages.csv new file mode 100644 index 0000000..8010dbd --- /dev/null +++ b/languages.csv @@ -0,0 +1,186 @@ +639-1 ,639-2/T ,639-2/B ,Language name ,Native name +aa ,aar ,aar ,Afar ,Afaraf +ab ,abk ,abk ,Abkhaz ,"аҧсуа бызшәа, аҧсшәа " +ae ,ave ,ave ,Avestan ,avesta +af ,afr ,afr ,Afrikaans ,Afrikaans +ak ,aka ,aka ,Akan ,Akan +am ,amh ,amh ,Amharic ,አማርኛ +an ,arg ,arg ,Aragonese ,aragonés +ar ,ara ,ara ,Arabic ,العربية +as ,asm ,asm ,Assamese ,অসমীয়া +av ,ava ,ava ,Avaric ,"авар мацӀ, магӀарул мацӀ " +ay ,aym ,aym ,Aymara ,aymar aru +az ,aze ,aze ,Azerbaijani ,azərbaycan dili +az ,azb ,azb ,South Azerbaijani ,تورکجه‎ +ba ,bak ,bak ,Bashkir ,башҡорт теле +be ,bel ,bel ,Belarusian ,беларуская мова +bg ,bul ,bul ,Bulgarian ,български език +bh ,bih ,bih ,Bihari ,भोजपुरी +bi ,bis ,bis ,Bislama ,Bislama +bm ,bam ,bam ,Bambara ,bamanankan +bn ,ben ,ben ,Bengali; Bangla ,বাংলা +bo ,bod ,tib ,"Tibetan Standard, Tibetan, Central ",བོད་ཡིག +br ,bre ,bre ,Breton ,brezhoneg +bs ,bos ,bos ,Bosnian ,bosanski jezik +ca ,cat ,cat ,Catalan; Valencian ,"català, valencià " +ce ,che ,che ,Chechen ,нохчийн мотт +ch ,cha ,cha ,Chamorro ,Chamoru +co ,cos ,cos ,Corsican ,"corsu, lingua corsa " +cr ,cre ,cre ,Cree ,ᓀᐦᐃᔭᐍᐏᐣ +cs ,ces ,cze ,Czech ,"čeština, český jazyk " +cu ,chu ,chu ,"Old Church Slavonic, Church Slavonic, Old Bulgarian ",ѩзыкъ словѣньскъ +cv ,chv ,chv ,Chuvash ,чӑваш чӗлхи +cy ,cym ,wel ,Welsh ,Cymraeg +da ,dan ,dan ,Danish ,dansk +de ,deu ,ger ,German ,Deutsch 
+dv ,div ,div ,Divehi; Dhivehi; Maldivian; ,ދިވެހި +dz ,dzo ,dzo ,Dzongkha ,རྫོང་ཁ +ee ,ewe ,ewe ,Ewe ,Eʋegbe +el ,ell ,gre ,"Greek, Modern ",ελληνικά +en ,eng ,eng ,English ,English +eo ,epo ,epo ,Esperanto ,Esperanto +es ,spa ,spa ,Spanish; Castilian ,"español, castellano " +et ,est ,est ,Estonian ,"eesti, eesti keel " +eu ,eus ,baq ,Basque ,"euskara, euskera " +fa ,fas ,per ,Persian (Farsi) ,فارسی +ff ,ful ,ful ,Fula; Fulah; Pulaar; Pular ,"Fulfulde, Pulaar, Pular " +fi ,fin ,fin ,Finnish ,"suomi, suomen kieli " +fj ,fij ,fij ,Fijian ,vosa Vakaviti +fo ,fao ,fao ,Faroese ,føroyskt +fr ,fra ,fre ,French ,"français, langue française " +fy ,fry ,fry ,Western Frisian ,Frysk +ga ,gle ,gle ,Irish ,Gaeilge +gd ,gla ,gla ,Scottish Gaelic; Gaelic ,Gàidhlig +gl ,glg ,glg ,Galician ,galego +gn ,grn ,grn ,Guaraní ,Avañe'ẽ +gu ,guj ,guj ,Gujarati ,ગુજરાતી +gv ,glv ,glv ,Manx ,"Gaelg, Gailck " +ha ,hau ,hau ,Hausa ,"Hausa, هَوُسَ " +he ,heb ,heb ,Hebrew (modern) ,עברית +hi ,hin ,hin ,Hindi ,"हिन्दी, हिंदी " +ho ,hmo ,hmo ,Hiri Motu ,Hiri Motu +hr ,hrv ,hrv ,Croatian ,hrvatski jezik +ht ,hat ,hat ,Haitian; Haitian Creole ,Kreyòl ayisyen +hu ,hun ,hun ,Hungarian ,magyar +hy ,hye ,arm ,Armenian ,Հայերեն +hz ,her ,her ,Herero ,Otjiherero +ia ,ina ,ina ,Interlingua ,Interlingua +id ,ind ,ind ,Indonesian ,Bahasa Indonesia +ie ,ile ,ile ,Interlingue ,Originally called Occidental; then Interlingue after WWII +ig ,ibo ,ibo ,Igbo ,Asụsụ Igbo +ii ,iii ,iii ,Nuosu ,ꆈꌠ꒿ Nuosuhxop +ik ,ipk ,ipk ,Inupiaq ,"Iñupiaq, Iñupiatun " +io ,ido ,ido ,Ido ,Ido +is ,isl ,ice ,Icelandic ,Íslenska +it ,ita ,ita ,Italian ,italiano +iu ,iku ,iku ,Inuktitut ,ᐃᓄᒃᑎᑐᑦ +ja ,jpn ,jpn ,Japanese ,日本語 (にほんご) +jv ,jav ,jav ,Javanese ,basa Jawa +ka ,kat ,geo ,Georgian ,ქართული +kg ,kon ,kon ,Kongo ,KiKongo +ki ,kik ,kik ,"Kikuyu, Gikuyu ",Gĩkũyũ +kj ,kua ,kua ,"Kwanyama, Kuanyama ",Kuanyama +kk ,kaz ,kaz ,Kazakh ,қазақ тілі +kl ,kal ,kal ,"Kalaallisut, Greenlandic ","kalaallisut, kalaallit oqaasii " +km ,khm ,khm 
,Khmer ,"ខ្មែរ, ខេមរភាសា, ភាសាខ្មែរ " +kn ,kan ,kan ,Kannada ,ಕನ್ನಡ +ko ,kor ,kor ,Korean ,"한국어 (韓國語), 조선어 (朝鮮語) " +kr ,kau ,kau ,Kanuri ,Kanuri +ks ,kas ,kas ,Kashmiri ,"कश्मीरी, كشميري‎ " +ku ,kur ,kur ,Kurdish ,"Kurdî, كوردی‎ " +kv ,kom ,kom ,Komi ,коми кыв +kw ,cor ,cor ,Cornish ,Kernewek +ky ,kir ,kir ,Kyrgyz ,"Кыргызча, Кыргыз тили " +la ,lat ,lat ,Latin ,"latine, lingua latina " +lb ,ltz ,ltz ,"Luxembourgish, Letzeburgesch ",Lëtzebuergesch +lg ,lug ,lug ,Ganda ,Luganda +li ,lim ,lim ,"Limburgish, Limburgan, Limburger ",Limburgs +ln ,lin ,lin ,Lingala ,Lingála +lo ,lao ,lao ,Lao ,ພາສາລາວ +lt ,lit ,lit ,Lithuanian ,lietuvių kalba +lu ,lub ,lub ,Luba-Katanga ,Tshiluba +lv ,lav ,lav ,Latvian ,latviešu valoda +mg ,mlg ,mlg ,Malagasy ,fiteny malagasy +mh ,mah ,mah ,Marshallese ,Kajin M̧ajeļ +mi ,mri ,mao ,Māori ,te reo Māori +mk ,mkd ,mac ,Macedonian ,македонски јазик +ml ,mal ,mal ,Malayalam ,മലയാളം +mn ,mon ,mon ,Mongolian ,монгол +mr ,mar ,mar ,Marathi (Marāṭhī) ,मराठी +ms ,msa ,may ,Malay ,"bahasa Melayu, بهاس ملايو‎ " +mt ,mlt ,mlt ,Maltese ,Malti +my ,mya ,bur ,Burmese ,ဗမာစာ +na ,nau ,nau ,Nauru ,Ekakairũ Naoero +nb ,nob ,nob ,Norwegian Bokmål ,Norsk bokmål +nd ,nde ,nde ,North Ndebele ,isiNdebele +ne ,nep ,nep ,Nepali ,नेपाली +ng ,ndo ,ndo ,Ndonga ,Owambo +nl ,nld ,dut ,Dutch ,"Nederlands, Vlaams " +nn ,nno ,nno ,Norwegian Nynorsk ,Norsk nynorsk +no ,nor ,nor ,Norwegian ,Norsk +nr ,nbl ,nbl ,South Ndebele ,isiNdebele +nv ,nav ,nav ,"Navajo, Navaho ","Diné bizaad, Dinékʼehǰí " +ny ,nya ,nya ,Chichewa; Chewa; Nyanja ,"chiCheŵa, chinyanja " +oc ,oci ,oci ,Occitan ,"occitan, lenga d'òc " +oj ,oji ,oji ,"Ojibwe, Ojibwa ",ᐊᓂᔑᓈᐯᒧᐎᓐ +om ,orm ,orm ,Oromo ,Afaan Oromoo +or ,ori ,ori ,Oriya ,ଓଡ଼ିଆ +os ,oss ,oss ,"Ossetian, Ossetic ",ирон æвзаг +pa ,pan ,pan ,"Panjabi, Punjabi ","ਪੰਜਾਬੀ, پنجابی‎ " +pi ,pli ,pli ,Pāli ,पाऴि +pl ,pol ,pol ,Polish ,"język polski, polszczyzna " +ps ,pus ,pus ,"Pashto, Pushto ",پښتو +pt ,por ,por ,Portuguese ,português +qu ,que ,que 
,Quechua ,"Runa Simi, Kichwa " +rm ,roh ,roh ,Romansh ,rumantsch grischun +rn ,run ,run ,Kirundi ,Ikirundi +ro ,ron ,rum ,Romanian ,limba română +ru ,rus ,rus ,Russian ,русский язык +rw ,kin ,kin ,Kinyarwanda ,Ikinyarwanda +sa ,san ,san ,Sanskrit (Saṁskṛta) ,संस्कृतम् +sc ,srd ,srd ,Sardinian ,sardu +sd ,snd ,snd ,Sindhi ,"सिन्धी, سنڌي، سندھی‎ " +se ,sme ,sme ,Northern Sami ,Davvisámegiella +sg ,sag ,sag ,Sango ,yângâ tî sängö +si ,sin ,sin ,"Sinhala, Sinhalese ",සිංහල +sk ,slk ,slo ,Slovak ,"slovenčina, slovenský jazyk " +sl ,slv ,slv ,Slovene ,"slovenski jezik, slovenščina " +sm ,smo ,smo ,Samoan ,gagana fa'a Samoa +sn ,sna ,sna ,Shona ,chiShona +so ,som ,som ,Somali ,"Soomaaliga, af Soomaali " +sq ,sqi ,alb ,Albanian ,gjuha shqipe +sr ,srp ,srp ,Serbian ,српски језик +ss ,ssw ,ssw ,Swati ,SiSwati +st ,sot ,sot ,Southern Sotho ,Sesotho +su ,sun ,sun ,Sundanese ,Basa Sunda +sv ,swe ,swe ,Swedish ,Svenska +sw ,swa ,swa ,Swahili ,Kiswahili +ta ,tam ,tam ,Tamil ,தமிழ் +te ,tel ,tel ,Telugu ,తెలుగు +tg ,tgk ,tgk ,Tajik ,"тоҷикӣ, toğikī, تاجیکی‎ " +th ,tha ,tha ,Thai ,ไทย +ti ,tir ,tir ,Tigrinya ,ትግርኛ +tk ,tuk ,tuk ,Turkmen ,"Türkmen, Түркмен " +tl ,tgl ,tgl ,Tagalog ,"Wikang Tagalog, ᜏᜒᜃᜅ᜔ ᜆᜄᜎᜓᜄ᜔ " +tn ,tsn ,tsn ,Tswana ,Setswana +to ,ton ,ton ,Tonga (Tonga Islands) ,faka Tonga +tr ,tur ,tur ,Turkish ,Türkçe +ts ,tso ,tso ,Tsonga ,Xitsonga +tt ,tat ,tat ,Tatar ,"татар теле, tatar tele " +tw ,twi ,twi ,Twi ,Twi +ty ,tah ,tah ,Tahitian ,Reo Tahiti +ug ,uig ,uig ,"Uyghur, Uighur ","Uyƣurqə, ئۇيغۇرچە‎ " +uk ,ukr ,ukr ,Ukrainian ,українська мова +ur ,urd ,urd ,Urdu ,اردو +uz ,uzb ,uzb ,Uzbek ,"O‘zbek, Ўзбек, أۇزبېك‎ " +ve ,ven ,ven ,Venda ,Tshivenḓa +vi ,vie ,vie ,Vietnamese ,Tiếng Việt +vo ,vol ,vol ,Volapük ,Volapük +wa ,wln ,wln ,Walloon ,walon +wo ,wol ,wol ,Wolof ,Wollof +xh ,xho ,xho ,Xhosa ,isiXhosa +yi ,yid ,yid ,Yiddish ,ייִדיש +yo ,yor ,yor ,Yoruba ,Yorùbá +za ,zha ,zha ,"Zhuang, Chuang ","Saɯ cueŋƅ, Saw cuengh " +zh ,zho ,chi ,Chinese ,"中文 (Zhōngwén), 汉语, 漢語 " 
+zu ,zul ,zul ,Zulu ,isiZulu \ No newline at end of file diff --git a/mediafileinfo.cpp b/mediafileinfo.cpp index 1f4ce9a..30c032c 100644 --- a/mediafileinfo.cpp +++ b/mediafileinfo.cpp @@ -3,6 +3,7 @@ #include "./backuphelper.h" #include "./diagnostics.h" #include "./exceptions.h" +#include "./language.h" #include "./progressfeedback.h" #include "./signature.h" #include "./tag.h" @@ -902,12 +903,11 @@ unordered_set MediaFileInfo::availableLanguages(MediaType type) const if (m_container) { for (size_t i = 0, count = m_container->trackCount(); i != count; ++i) { const AbstractTrack *track = m_container->track(i); - if ((type == MediaType::Unknown || track->mediaType() == type) && !track->language().empty() && track->language() != "und") { + if ((type == MediaType::Unknown || track->mediaType() == type) && isLanguageDefined(track->language())) { res.emplace(track->language()); } } - } else if (m_singleTrack && (type == MediaType::Unknown || m_singleTrack->mediaType() == type) && !m_singleTrack->language().empty() - && m_singleTrack->language() != "und") { + } else if (m_singleTrack && (type == MediaType::Unknown || m_singleTrack->mediaType() == type) && isLanguageDefined(m_singleTrack->language())) { res.emplace(m_singleTrack->language()); } return res; diff --git a/tests/mediafileinfo.cpp b/tests/mediafileinfo.cpp index e9506b2..76bd2c0 100644 --- a/tests/mediafileinfo.cpp +++ b/tests/mediafileinfo.cpp @@ -186,6 +186,6 @@ void MediaFileInfoTests::testFullParseAndFurtherProperties() CPPUNIT_ASSERT_EQUAL(unordered_set({ "eng" }), file.availableLanguages()); CPPUNIT_ASSERT_EQUAL(unordered_set({}), file.availableLanguages(MediaType::Text)); CPPUNIT_ASSERT_EQUAL("ID: 2422994868, type: Video"s, file.tracks()[0]->label()); - CPPUNIT_ASSERT_EQUAL("ID: 3653291187, type: Audio, language: \"eng\""s, file.tracks()[1]->label()); + CPPUNIT_ASSERT_EQUAL("ID: 3653291187, type: Audio, language: English"s, file.tracks()[1]->label()); CPPUNIT_ASSERT_EQUAL("MS-MPEG-4-480p / 
MP3-2ch-eng"s, file.technicalSummary()); }