This is the Perl script I use to generate various Wikimedia Unicode character roadmaps.
It creates multilingual images for these Unicode planes:
- Roadmap_to_Unicode_BMP_multilingual.svg (from output file new_svgs/map_BMP_multilingual.svg)
- Roadmap_to_Unicode_SIP_multilingual.svg (from output file new_svgs/map_SIP_multilingual.svg)
- Roadmap_to_Unicode_SMP_multilingual.svg (from output file new_svgs/map_SMP_multilingual.svg)
- Roadmap_to_Unicode_SSP_multilingual.svg (from output file new_svgs/map_SSP_multilingual.svg)
- Roadmap_to_Unicode_TIP_multilingual.svg (from output file new_svgs/map_TIP_multilingual.svg)
As well as the legacy English-only images:
- Roadmap_to_Unicode_BMP.svg (from output file new_svgs/map_BMP_en.svg)
- Roadmap_to_Unicode_SIP.svg (from output file new_svgs/map_SIP_en.svg)
- Roadmap_to_the_Unicode_SMP.svg (from output file new_svgs/map_SMP_en.svg)
- Roadmap_to_Unicode_SSP.svg (from output file new_svgs/map_SSP_en.svg)
- Roadmap_to_Unicode_TIP.svg (from output file new_svgs/map_TIP_en.svg)
The script was based on Saric's script for the BMP Roadmap.
Feel free to use and adapt it or ping me if you'd like to add or correct a translation.
The following languages are supported: English, Belarusian, Chinese using simplified characters, Chinese using traditional characters, Czech, Dutch, French, German, Hungarian, Korean, Persian, Portuguese, Russian, Spanish, Turkish and Ukrainian.
I also have a test page so I can review all of the languages at once.
#!/usr/bin/perl
# last updated 2024-09-11
#
# This script uses Unicode data to create roadmaps of character allocation by plane.
# The groupings more or less match up to the chapters of the Unicode Standard.
#
# Various types of SVG images are created for each named Unicode plane with
# allocated code points using the naming convention new_svgs/map_XXX_variation.svg
# where
# XXX is BMP, SIP, SMP, SSP, or TIP
# variation is "multilingual" (850 by 500px),
# "en" (English legend only, 750px by 750px), or
# "no_legend" (500px by 500px).
#
# SVG images can also be created for the other planes but aren't very interesting
# and shouldn't be added to Wikimedia/Wikipedia.
# If the -all option is specified, the script will generate output
# for the other planes using the naming convention new_svgs/map_Znn_variation.svg
# where nn is the plane number (for example, new_svgs/map_Z03_multilingual.svg).
#
# Note that the multilingual SVG files created by this script use the <switch>
# element and the 'systemLanguage' attribute to provide a legend in multiple
# languages within one SVG file. Wikipedia also allows the language to be
# specified explicitly with the lang parameter: [[File:filename.svg|lang=code]]
# Keep in mind that if the SVG display program doesn't support the allowReorder
# attribute you'll see the FIRST language match not the BEST language match.
# For example, if the web browser language preferences are Dutch (nl) then German (de),
# and it doesn't support allowReorder, you'll see German because "de" comes before "nl"
# in the SVG file.
use utf8;
use feature 'unicode_strings';
use warnings;
use strict;
use Getopt::Long qw(GetOptions);
use POSIX qw(ceil floor);
use SVG;
# Command-line options. The $opt_* package variables keep their historical
# names because the rest of the script reads them directly.
our $opt_all  = 0;  # print all 17 planes if specified
our $opt_test = ""; # replace English text with this language
our $opt_help = 0;  # show help display and exit
# Bind each option to its variable explicitly instead of relying on
# Getopt::Long's legacy behavior of creating $opt_* globals implicitly.
my $result = GetOptions(
    'all|a'    => \$opt_all,
    'test|t=s' => \$opt_test,
    'help|h|?' => \$opt_help,
);
if($opt_help || !$result) {
    # A requested help screen goes to STDOUT and is a success; a usage message
    # caused by bad options goes to STDERR and must fail, so that shell
    # scripts and make rules can detect the mistake.
    my $usageHandle = $result ? \*STDOUT : \*STDERR;
    print {$usageHandle} "Syntax: $0 [-all] [-test lang] [-?]\n";
    print {$usageHandle} " where \"-all\" prints all 17 Unicode planes\n";
    print {$usageHandle} " \"-test lang\" replaces English text with language code \"lang\",\n";
    print {$usageHandle} " draws dashed lines around the legend,\n";
    print {$usageHandle} " and prefaces the output file names with test_lang_\n";
    print {$usageHandle} " \"-?\" shows this helpful information and exits\n";
    exit($result ? 0 : 2);
}
# Directory (relative to the current working directory) that receives the
# generated SVG files. Create it on first use and stop immediately if that
# fails, rather than failing later when the first file is written.
my $outputDirectory = "new_svgs";
if(!-d $outputDirectory) {
    mkdir($outputDirectory)
        or die "Cannot create output directory \"$outputDirectory\": $!\n";
}
# Font stack used for the legend text (and the code point labels).
our $defaultFontFamily = "'DejaVu Sans', sans-serif";

# Per-language font stacks. Add a language here only when its text cannot be
# rendered with the default font stack above.
our %fontFamilyOverride;
$fontFamilyOverride{ko} = "'UnJamoDotum', sans-serif";

# Right-to-left languages. Only the existence of a key is ever tested, so
# every value is simply undef.
our %isRTL = map { $_ => undef } qw(
    fa
); # fa = Persian
# Legend text for every supported language.
# Outer key: a language code, or a comma-separated list of codes (see the two
#   Chinese variants). The -test option looks a language up by this exact key.
# Inner key: a legend group name; value: the text shown for that group.
#   - "\n" inside a value marks a break in the text (presumably rendered as a
#     second legend line; the drawing code is outside this table -- confirm).
#   - asOfVersion is a format string; its %s placeholder presumably receives
#     the Unicode version number (TODO confirm against the legend code).
#     Some entries spell the placeholder \%s, which inside a double-quoted
#     Perl string is exactly the same as %s.
our %legendTextByLanguageAndGroup = (
# English
en => {
Africa => "African scripts",
Americas => "American scripts",
AsiaEast => "East Asian scripts",
AsiaSC => "South and\nCentral Asian scripts",
AsiaSE => "Southeast Asian scripts",
asOfVersion => "As of Unicode \%s",
cuneiform => "Cuneiform",
Europe => "Non-Latin European scripts",
Han => "CJK characters",
hieroglyphs => "Hieroglyphs",
IndOcean => "Indonesian and\nOceanic scripts",
Latin => "Latin script",
ME => "Middle Eastern and\nSouthwest Asian scripts",
misc => "Miscellaneous characters",
notation => "Notational systems",
private => "Private use",
surrogates => "UTF-16 surrogates",
symbols => "Symbols",
tags => "Tags",
unallocated => "Unallocated code points",
variation => "Variation Selectors",
},
# Belarusian
be => {
Africa => "Пісьменства Афрыкі",
Americas => "Пісьменства Амерыкі",
AsiaEast => "Пісьменства Усходняй Азіі",
AsiaSC => "Пісьменства Паўднёвай і\nЦэнтральнай Азіі",
AsiaSE => "Пісьменства Паўднёва-Усходняй Азіі",
asOfVersion => "Па стане на версію Унікода %s",
cuneiform => "Клінапіс",
Europe => "Нелацінскія еўрапейскія пісьменства",
Han => "Ідэаграмы ККЯ",
hieroglyphs => "Іерогліфы",
IndOcean => "Пісьменства Інданезіі і Акіяніі",
Latin => "Лацінская пісьменнасць",
ME => "Пісьменства Сярэдняга Усходу і\nПаўднёва-Заходняй Азіі",
misc => "Розныя сімвалы",
notation => "Сістэмы нотапісу",
private => "Вобласць для прыватнага выкарыстання",
surrogates => "Сурагатныя пары UTF-16",
symbols => "Знакі",
tags => "Тэгі",
unallocated => "Свабодныя кодавыя пазіцыі",
variation => "Варыянтныя селектары",
},
# Chinese using simplified characters
"zh-cmn-Hans,zh-Hans,zh-CN" => {
Africa => "非洲文字",
Americas => "美洲文字",
AsiaEast => "东亚文字",
AsiaSC => "南亚及中亚文字",
AsiaSE => "东南亚文字",
asOfVersion => "目前版本为Unicode %s",
cuneiform => "楔形文字",
Europe => "非拉丁欧洲文字",
Han => "中日韩汉字",
hieroglyphs => "象形文字",
IndOcean => "印度尼西亚及大洋洲文字",
Latin => "拉丁文字",
ME => "中东及西南亚文字",
misc => "杂项字符",
notation => "符号系统",
private => "私人使用区",
surrogates => "UTF-16代理区",
symbols => "符号",
tags => "标签",
unallocated => "未分配代码点",
variation => "变体选择符",
},
# Chinese using traditional characters
"zh-cmn-Hant,zh-Hant,zh-TW" => {
Africa => "非洲文字",
Americas => "美洲文字",
AsiaEast => "東亞文字",
AsiaSC => "南亞及中亞文字",
AsiaSE => "東南亞文字",
asOfVersion => "目前版本為Unicode %s",
cuneiform => "楔形文字",
Europe => "非拉丁歐洲文字",
Han => "中日韓漢字",
hieroglyphs => "象形文字",
IndOcean => "印度尼西亞及大洋洲文字",
Latin => "拉丁文字",
ME => "中東及西南亞文字",
misc => "雜項字符",
notation => "符號系統",
private => "私人使用區",
surrogates => "UTF-16代理區",
symbols => "符號",
tags => "標籤",
unallocated => "未分配代碼點",
variation => "變體選擇符",
},
# Czech
cs => {
Africa => "Africká písma",
Americas => "Americká písma",
AsiaEast => "Východoasijská písma",
AsiaSC => "Písma jižní a střední Asie",
AsiaSE => "Písma jihovýchodní Asie",
asOfVersion => "V Unicode %s",
cuneiform => "Klínové písmo",
Europe => "Nelatinková evropská písma",
Han => "Čínština, japonština a korejština",
hieroglyphs => "Hieroglyfy",
IndOcean => "Písma Indonésie a Oceánie",
Latin => "Latinka",
ME => "Písma Blízkého a Středního východu",
misc => "Různé znaky",
notation => "Notační systémy",
private => "Pro soukromé použití",
surrogates => "Náhradní páry UTF-16 (surrogate pairs)",
symbols => "Symboly",
tags => "Jmenovky (tags)",
unallocated => "Nepřidělené kódové body",
variation => "Selektory variant",
},
# Dutch
nl => {
Africa => "Afrikaanse schriften",
Americas => "Noord- en Zuid-Amerikaanse schriften",
AsiaEast => "Oost-Aziatische schriften",
AsiaSC => "Zuid- en Centraal-Aziatische schriften",
AsiaSE => "Zuidoost-Aziatische schriften",
asOfVersion => "Geldig voor Unicode \%s",
cuneiform => "Spijkerschrift",
Europe => "Niet-Latijnse Europese schriften",
Han => "CJK-karakters",
hieroglyphs => "Hiërogliefen",
IndOcean => "Indonesische en Oceanische schriften",
Latin => "Latijnse schriften",
ME => "Midden-Oosterse en\nZuidwest-Aziatische schriften",
misc => "Diverse karakters",
notation => "Notatiesystemen",
private => "Privégebruik",
surrogates => "UTF-16-plaatsvervangers",
symbols => "Symbolen",
tags => "Tags",
unallocated => "Niet toegekend",
variation => "Variantkeuzes",
},
# French
fr => {
Africa => "Écritures africaines",
Americas => "Écriture américaine",
AsiaEast => "Écriture de l'Asie de l'Est",
AsiaSC => "Écriture de l'Asie centrale et du Sud",
AsiaSE => "Écriture de l'Asie du Sud-Est",
asOfVersion => "À partir d'Unicode %s",
cuneiform => "Cunéiforme",
Europe => "Écriture européenne non latine",
Han => "Caractères CJK",
hieroglyphs => "Hiéroglyphes",
IndOcean => "Écritures indonésiennes\net océaniennes",
Latin => "Écriture latine",
ME => "Écriture du Moyen-Orient\net de l'Asie du Sud-Ouest",
misc => "Caractères divers",
notation => "Systèmes de notation",
private => "Usage privé",
surrogates => "Seizet d'indirection de l'UTF-16",
symbols => "Symboles",
tags => "Étiquettes",
unallocated => "Points de code non alloués",
variation => "Sélecteurs de variations",
},
# German
de => {
Africa => "Afrikanische Schriften",
Americas => "Amerikanische Schriften",
AsiaEast => "Ostasiatische Schriften",
AsiaSC => "Süd- und Mittelasiatische\nSchriften",
AsiaSE => "Südostasiatische Schriften",
asOfVersion => "Stand: Unicode \%s",
cuneiform => "Keilschrift",
Europe => "Andere europäische Schriften",
Han => "CJK-Ideogramme",
hieroglyphs => "Hieroglyphen",
IndOcean => "Indonesische und ozeanische\nSchriften",
Latin => "Lateinische Schriften und Symbole",
ME => "Nahost- und Südwestasiatische\nSchriften",
misc => "Verschiedene Zeichen",
notation => "Notationssysteme",
private => "Privater Nutzungsbereich",
surrogates => "UTF-16-Surrogates",
symbols => "Symbole",
tags => "Tags",
unallocated => "Nicht belegte Codebereiche",
variation => "Variantenselektoren",
},
# Hungarian
hu => {
Africa => "Afrikai írásrendszerek",
Americas => "Amerikai írásrendszerek",
AsiaEast => "Kelet-ázsiai írásrendszerek",
AsiaSC => "Dél- és közép-ázsiai írásrendszerek",
AsiaSE => "Délkelet-ázsiai írásrendszerek",
asOfVersion => "A Unicode \%s szerint",
cuneiform => "Ékírás",
Europe => "Nem latin betűs\neurópai írásrendszerek",
Han => "CJK (kínai, japán, koreai) karakterek",
hieroglyphs => "Hieroglifák",
IndOcean => "Indonéziai és óceániai\nírásrendszerek",
Latin => "Latin betűs írás",
ME => "Közel-keleti és délnyugat-\názsiai írásrendszerek",
misc => "Egyéb karakterek",
notation => "Jelölésrendszerek",
private => "Saját használatú terület",
surrogates => "UTF-16-helyettesítők",
symbols => "Szimbólumok",
tags => "Címkék",
unallocated => "Nem használt kódpontok",
variation => "Variációválasztók",
},
# Korean
# NOTE(review): the notation, tags and variation entries below still carry the
# English text; they are candidates for translation.
ko => {
Africa => "아프리카 문자",
Americas => "북미 및 남미 문자",
AsiaEast => "동아시아 문자",
AsiaSC => "남부와 중앙 아시아 문자",
AsiaSE => "동남아시아 문자",
asOfVersion => "유니 코드 버전 \%s",
cuneiform => "쐐기 문자",
Europe => "기타 유럽 문자",
Han => "CJK 문자",
hieroglyphs => "상형 문자",
IndOcean => "인도네시아, 오세아니아 문자",
Latin => "로마자, 로마자권 기호",
ME => "중동·서남아시아 문자",
misc => "기타 문자",
notation => "Notational systems",
private => "사용자 정의 영역",
surrogates => "UTF-16 상·하위 대체 영역",
symbols => "기호",
tags => "Tags",
unallocated => "쓰이지 않음",
variation => "Variation Selectors",
},
# Persian
fa => {
Africa => "خطهای آفریقایی",
Americas => "خطهای آمریکایی",
AsiaEast => "خطهای آسیای شرقی",
AsiaSC => "خطهای جنوب آسیا و آسیای میانه",
AsiaSE => "خطهای جنوب شرق آسیا",
asOfVersion => "تا یونیکد \%s",
cuneiform => "خط میخی",
Europe => "خطهای اروپایی غیر لاتین",
Han => "اندیشهنگاریهای CJK",
hieroglyphs => "هیروگلیفها",
IndOcean => "خطهای اندونزی و اقیانوسیه",
Latin => "خط لاتین",
ME => "خطهای خاورمیانه و جنوب آسیا",
misc => "نویسههای متفرقه",
notation => "نمادگانها",
private => "کاربرد شخصی",
surrogates => "جایگزینهای UTF-16",
symbols => "نمادها",
tags => "برچسبها",
unallocated => "موقعیتکدهای منتسبنشده",
variation => "انتخابگرهای گلیف",
},
# Portuguese
pt => {
Africa => "Escrita africana",
Americas => "Escrita americana",
AsiaEast => "Escrita da Ásia Oriental",
AsiaSC => "Escrita da Ásia Central\ne do Sul",
AsiaSE => "Escrita do Sudeste Asiático",
asOfVersion => "A partir do Unicode %s",
cuneiform => "Cuneiforme",
Europe => "Escrita europeia não latina",
Han => "Caracteres CJK",
hieroglyphs => "Hieróglifos",
IndOcean => "Escrita indonésia e oceânica",
Latin => "Escrita latina",
ME => "Escrita do Oriente Médio\ne do Sudoeste Asiático",
misc => "Caracteres diversos",
notation => "Sistemas de notação",
private => "Uso privado",
surrogates => "Substitutos do UTF-16",
symbols => "Símbolos",
tags => "Etiquetas",
unallocated => "Pontos de código não atribuídos",
variation => "Seletores de variação",
},
# Russian
ru => {
Africa => "Письменности Африки",
Americas => "Письменности Америки",
AsiaEast => "Письменности Восточной Азии",
AsiaSC => "Письменности Южной и\nЦентральной Азии",
AsiaSE => "Письменности Юго-Восточной Азии",
asOfVersion => "По состоянию на версию Юникода %s",
cuneiform => "Клинопись",
Europe => "Нелатинские европейские письменности",
Han => "Идеограммы ККЯ",
hieroglyphs => "Иероглифы",
IndOcean => "Письменности Индонезии и Океании",
Latin => "Латинская письменность",
ME => "Письменности Среднего Востока и\nЮго-Западной Азии",
misc => "Разные символы",
notation => "Системы нотописи",
private => "Область для частного использования",
surrogates => "Суррогатные пары UTF-16",
symbols => "Знаки",
tags => "Тэги",
unallocated => "Свободные кодовые позиции",
variation => "Вариантные селекторы",
},
# Spanish
es => {
Africa => "Escrituras africanas",
Americas => "Escrituras americanas",
AsiaEast => "Escrituras de Asia Oriental",
AsiaSC => "Escrituras de Asia Meridional\ny Asia Central",
AsiaSE => "Escrituras del Sudeste Asiático",
asOfVersion => "A partir de Unicode %s",
cuneiform => "Cuneiforme",
Europe => "Escrituras europeas no latinas",
Han => "Caracteres CJK",
hieroglyphs => "Jeroglíficos",
IndOcean => "Escrituras indonesias y oceánicas",
Latin => "Escritura latina",
ME => "Escrituras del Oriente Medio\ny del Asia sudoccidental",
misc => "Caracteres varios",
notation => "Sistemas notacionales",
private => "Uso privado",
surrogates => "Sustitutos de UTF-16",
symbols => "Símbolos",
tags => "Etiquetas",
unallocated => "Puntos de código no asignados",
variation => "Selectores de variación",
},
# Turkish
tr => {
Africa => "Afrika yazıları",
Americas => "Amerikan yazını",
AsiaEast => "Doğu Asya yazıları",
AsiaSC => "Güney ve Orta Asya yazıları",
AsiaSE => "Güneydoğu Asya yazını",
asOfVersion => "Unicode %s'den itibaren",
cuneiform => "Çivi Yazısı",
Europe => "Latin olmayan Avrupa yazıları",
Han => "Çince ve Japonca karakterler",
hieroglyphs => "Hiyeroglifler",
IndOcean => "Endonezya ve Okyanusya yazıları",
Latin => "Latince yazı",
ME => "Orta Doğu ve Güneybatı Asya yazını",
misc => "Çeşitli karakterler",
notation => "Notasyonel sistemler",
private => "Özel kullanım",
surrogates => "UTF-16 vekilleri",
symbols => "Semboller",
tags => "Etiketler",
unallocated => "Ayrılmamış kod noktaları",
variation => "Varyasyon Seçiciler",
},
# Ukrainian
uk => {
Africa => "Писемності Африки",
Americas => "Писемності Америки",
AsiaEast => "Писемності Східної Азії",
AsiaSC => "Писемності Південної і\nЦентральної Азії",
AsiaSE => "Писемності Південно-Східної Азії",
asOfVersion => "Станом на версію Юнікоду %s",
cuneiform => "Клинопис",
Europe => "Нелатинські європейські писемності",
Han => "Ідеограми ККЯ",
hieroglyphs => "Ієрогліфи",
IndOcean => "Писемності Індонезії та Океанії",
Latin => "Латинська писемність",
ME => "Писемності Середнього Сходу і\nПівденно-Західної Азії",
misc => "Різні символи",
notation => "Системи нотописі",
private => "Область для приватного використання",
surrogates => "Сурогатні пари UTF-16",
symbols => "Знаки",
tags => "Теги",
unallocated => "Вільні кодові позиції",
variation => "Варіантні селектори",
},
);
# Test mode (-test lang): show the requested language wherever the English
# legend would normally appear, so a translation can be reviewed in isolation.
if ($opt_test) {
    if (!exists $legendTextByLanguageAndGroup{$opt_test}) {
        # Unknown code: list every key that would have been accepted, then give up.
        print STDERR "Unsupported language code $opt_test specified on -test option\nSupported language codes:";
        for my $supportedCode (sort customLanguageSort keys %legendTextByLanguageAndGroup) {
            print STDERR " $supportedCode";
        }
        print STDERR "\n";
        exit 1;
    }
    warn "\nWarning: Overwriting systemLanguage=en values with $opt_test text for testing purposes\n";
    # Both keys now share one inner hash: 'en' displays the tested language.
    $legendTextByLanguageAndGroup{'en'} = $legendTextByLanguageAndGroup{$opt_test};
    # Carry the tested language's text direction over to the 'en' slot as well.
    $isRTL{'en'} = undef if exists $isRTL{$opt_test};
}

# Register the fallback legend under the pseudo-language key 'default'.
my $defaultLanguage = 'en';
$legendTextByLanguageAndGroup{'default'} = $legendTextByLanguageAndGroup{$defaultLanguage};
# Per-plane inputs, filled in by the data sections that follow:
our %legendByPlane; # plane number => array ref of legend group keys
our %dataByPlane;   # plane number => allocation table (multi-line string)

# Unicode version to add to the legend
our $version = '16.0';

# Acronyms of the planes that have one, keyed by plane number.
our %planeAcronymByNumber = (
    0  => 'BMP',
    1  => 'SMP',
    2  => 'SIP',
    3  => 'TIP',
    14 => 'SSP',
);

# Full names of those same planes, keyed by plane number.
our %planeNameByNumber = (
    0  => 'Basic Multilingual Plane',
    1  => 'Supplementary Multilingual Plane',
    2  => 'Supplementary Ideographic Plane',
    3  => 'Tertiary Ideographic Plane',
    14 => 'Supplementary Special-purpose Plane',
);
# Note: Unicode has not published identifying names for planes 15 and 16.
# Chapter 2.8 of the Unicode Standard says "The two Private Use Planes (Planes 15 and 16)",
# while the PUA block names used are Supplementary PUA-A and Supplementary PUA-B.
# --------------- Plane 0 Data (BMP) ---------------
# $legendByPlane{N} lists the group keys for plane N's legend (presumably in
# display order -- the legend drawing code is not part of this section; confirm).
# $dataByPlane{N} is a table with one "abc d group" entry per line:
#   a     = plane digit
#   b, c  = row and column of the cell in the 16x16 roadmap square
#   d     = how many sixteenths into that cell the group begins
#   group = a key of %legendColorByGroup
# Each entry starts a run of its group's color that continues until the next entry.
$legendByPlane{0} = [qw(Latin Europe Africa ME AsiaSC AsiaSE AsiaEast Han IndOcean Americas notation symbols private surrogates unallocated asOfVersion)];
# FYI: BMP legend omits "misc" and "variation" groups because they're too small to see.
$dataByPlane{0} =
# abc d group # comment (for a starting code point of U+abcd0)
qq[000 0 Latin
003 0 Europe
005 9 ME
007 8 AsiaSC
007 C Africa
008 0 ME
009 0 AsiaSC
00E 0 AsiaSE
00F 0 AsiaSC
010 0 AsiaSE
010 A Europe
011 0 AsiaEast
012 0 Africa
013 A Americas
016 8 Europe
017 0 IndOcean
017 8 AsiaSE
018 0 AsiaSC
018 B Americas
019 0 AsiaSC
019 5 AsiaSE
01A 0 IndOcean
01A 2 AsiaSE
01A B Europe
01B 0 IndOcean
01C 0 AsiaSC
01C 8 Europe
01C C IndOcean
01C D AsiaSC
01D 0 Latin
01D C Europe
01E 0 Latin
01F 0 Europe
020 7 symbols
028 0 notation
029 0 symbols
02C 0 Europe
02C 6 Latin
02C 8 Europe
02D 3 Africa
02D E Europe
02E 0 symbols
02E 8 Han
02F E unallocated
02F F Han
030 0 AsiaEast
031 C Han
031 F AsiaEast
032 0 symbols
034 0 Han
04D C symbols
04E 0 Han
0A0 0 AsiaEast
0A5 0 Africa
0A6 4 Europe
0A6 A Africa
0A7 0 AsiaEast
0A7 2 Latin
0A8 0 AsiaSC
0A8 3 symbols
0A8 4 AsiaSC
0A9 0 AsiaSE
0A9 3 IndOcean
0A9 6 AsiaEast
0A9 8 IndOcean
0A9 E AsiaSE
0AA E AsiaSC
0AB 0 Africa
0AB 3 Latin
0AB 7 Americas
0AB C AsiaSC
0AC 0 AsiaEast
0D8 0 surrogates
0E0 0 private
0F9 0 Han
0FB 0 Latin
0FB 1 Europe
0FB 2 ME
0FE 0 variation
0FE 1 AsiaEast
0FE 2 Europe
0FE 3 AsiaEast
0FE 7 ME
0FF 0 AsiaEast
0FF F misc];
# --------------- Plane 1 Data (SMP) ---------------
# Legend groups and allocation table for the Supplementary Multilingual Plane.
# Each table line is "abc d group": the first three hex digits of the code point
# where the group begins, the fourth hex digit, and a key of %legendColorByGroup.
# A group continues until the next line, so gaps are listed explicitly as
# "unallocated".
$legendByPlane{1} = [qw(Latin Europe Africa ME AsiaSC AsiaSE AsiaEast IndOcean Americas cuneiform hieroglyphs notation symbols unallocated asOfVersion)];
$dataByPlane{1} =
# abc d group # comment (for a starting code point of U+abcd0)
qq[100 0 Europe
102 0 unallocated
102 8 Europe
103 8 cuneiform
103 E unallocated
104 0 Americas
104 5 Europe
104 8 Africa
104 B Americas
105 0 Europe
107 8 Latin
107 C unallocated
108 0 Europe
108 4 ME
108 B unallocated
108 E ME
109 2 Europe
109 4 unallocated
109 8 hieroglyphs
10A 0 AsiaSC
10A 6 ME
10A A unallocated
10A C ME
10B B unallocated
10C 0 AsiaSC
10C 5 unallocated
10C 8 Europe
10D 0 AsiaSE
10D 4 Africa
10D 9 unallocated
10E 6 symbols
10E 8 ME
10F 0 AsiaSC
10F E ME
110 0 AsiaSC
112 5 unallocated
112 8 AsiaSC
114 E unallocated
115 8 AsiaSC
116 D AsiaSE
117 0 AsiaSC
117 5 unallocated
118 0 AsiaSC
118 5 unallocated
118 A AsiaSC
119 6 unallocated
119 A AsiaSC
11A B Americas
11A C AsiaSE
11B 0 AsiaSC
11B 6 unallocated
11B C AsiaSC
11C C unallocated
11D 0 AsiaSC
11D B unallocated
11E E IndOcean
11F 6 unallocated
11F B AsiaEast
11F C AsiaSC
120 0 cuneiform
125 5 unallocated
12F 9 Europe
130 0 hieroglyphs
146 8 unallocated
161 0 AsiaSC
161 4 unallocated
168 0 Africa
16A 4 AsiaSC
16A D Africa
16B 0 AsiaSE
16B 9 unallocated
16D 4 AsiaSC
16D 8 unallocated
16E 4 Africa
16E A unallocated
16F 0 AsiaEast
16F A unallocated
16F E AsiaEast
18D 8 unallocated
1AF F AsiaEast
1B3 0 unallocated
1BC 0 notation
1BC B unallocated
1CC 0 symbols
1CE C unallocated
1CF 0 notation
1CF D unallocated
1D0 0 notation
1D2 5 unallocated
1D2 C symbols
1D3 8 unallocated
1D4 0 symbols
1D8 0 notation
1DA B unallocated
1DF 0 Latin
1E0 0 Europe
1E0 9 unallocated
1E1 0 AsiaSE
1E1 5 unallocated
1E2 9 AsiaSC
1E3 0 unallocated
1E4 D AsiaSC
1E5 0 unallocated
1E5 D AsiaSC
1E6 0 unallocated
1E7 E Africa
1E8 E unallocated
1E9 0 Africa
1E9 6 unallocated
1EC 7 symbols
1EC C unallocated
1ED 0 symbols
1ED 5 unallocated
1EE 0 symbols
1EF 0 unallocated
1F0 0 symbols
1FC 0 unallocated];
# --------------- Plane 2 Data (SIP) ---------------
# Legend groups and allocation table for the Supplementary Ideographic Plane.
# Table lines are "abc d group": where a group begins (first three hex digits,
# then the fourth) and a key of %legendColorByGroup; it lasts until the next line.
$legendByPlane{2} = [qw(Han unallocated asOfVersion)];
$dataByPlane{2} =
# abc d group # comment (for a starting code point of U+abcd0)
qq[200 0 Han
2A6 E unallocated
2A7 0 Han
2EE 6 unallocated
2F8 0 Han
2FA 2 unallocated];
# --------------- Plane 3 Data (TIP) ---------------
# Legend groups and allocation table for the Tertiary Ideographic Plane,
# in the same "abc d group" format.
$legendByPlane{3} = [qw(Han unallocated asOfVersion)];
$dataByPlane{3} =
# abc d group # comment (for a starting code point of U+abcd0)
qq[300 0 Han
323 B unallocated];
# --------------- Planes 4 through 13 Data ---------------
# None of these planes has any allocation, so every one of them gets the same
# two-item legend and a one-line table in the usual "abc d group" format
# (for a starting code point of U+abcd0): the plane's own hex digit followed
# by "00 0 unallocated".
foreach my $emptyPlane (4 .. 13) {
    $legendByPlane{$emptyPlane} = [qw(unallocated asOfVersion)];
    $dataByPlane{$emptyPlane}   = sprintf('%X00 0 unallocated', $emptyPlane);
}
# --------------- Plane 14 Data (SSP) ---------------
# Legend groups and allocation table for the Supplementary Special-purpose Plane.
# Table lines are "abc d group": where a group begins (first three hex digits,
# then the fourth) and a key of %legendColorByGroup; it lasts until the next line.
$legendByPlane{14} = [qw(tags variation unallocated asOfVersion)];
$dataByPlane{14} =
# abc d group # comment (for a starting code point of U+abcd0)
qq[E00 0 tags
E00 8 unallocated
E01 0 variation
E01 F unallocated];
# --------------- Plane 15 Data ---------------
# Planes 15 and 16 are private use throughout, so each table is a single entry.
$legendByPlane{15} = [qw(private asOfVersion)];
$dataByPlane{15} =
# abc d group # comment (for a starting code point of U+abcd0)
qq[F00 0 private];
# --------------- Plane 16 Data ---------------
# NOTE(review): the table format has only one hex digit for the plane, which
# cannot express plane 16 (hex 10). In the parsing code visible in this file
# that digit is captured but the cell labels are built from the plane number
# itself, which is presumably why "000 0" is harmless here -- confirm before
# relying on the plane digit anywhere else.
$legendByPlane{16} = [qw(private asOfVersion)];
$dataByPlane{16} =
# abc d group # comment (for a starting code point of U+abcd0)
qq[000 0 private]; # "000 0" looks wrong but works
# ---------------------------------------------------
# Fill color of every legend group, written as "group color" pairs.
# See https://en.wiki.x.io/wiki/Web_colors when selecting new colors
our %legendColorByGroup = qw(
    Africa       lightgreen
    Americas     yellow
    AsiaEast     crimson
    AsiaSC       darkgreen
    AsiaSE       purple
    cuneiform    rosybrown
    Europe       blue
    Han          salmon
    hieroglyphs  cornflowerblue
    IndOcean     sienna
    Latin        black
    ME           orange
    misc         pink
    notation     cyan
    private      darkgray
    surrogates   lightgray
    symbols      orchid
    tags         slategray
    unallocated  white
    variation    darkkhaki
);
# Text color that contrasts with each possible background color:
# black text on the light backgrounds, white text on the dark ones.
our %textColors;
$textColors{$_} = 'black'
    foreach qw(cornflowerblue crimson cyan darkgray darkkhaki lightblue lightgray lightgreen
               orange orchid pink rosybrown salmon slategray white yellow);
$textColors{$_} = 'white'
    foreach qw(black blue darkgreen purple sienna);
# make sure every group color has a contrasting text color:
# (groups are visited in sorted order so the same data problem is always
# the one reported, whatever the hash ordering of this run)
foreach my $group (sort keys %legendColorByGroup) {
    my $color = $legendColorByGroup{$group};
    if(!exists($textColors{$color})) {
        die "Fatal data error: Need to define contrasting text color for \"$color\" in \%textColors";
    }
}
# warn of duplicate colors
# %legendGroupByColor maps a color to the comma-terminated list of groups that use it.
my %legendGroupByColor = ();
foreach my $group (sort keys %legendColorByGroup) {
    $legendGroupByColor{$legendColorByGroup{$group}} .= "$group,";
}
foreach my $color (sort keys %legendGroupByColor) {
    # Use a lexical array here; assigning to the global @_ at file scope would
    # clobber it for any code that runs afterwards.
    my @groups = split /,/, $legendGroupByColor{$color};
    if(scalar @groups > 1) {
        warn "Warning: Color \"$color\" is used by multiple groups: @groups\n";
    }
}
our $gradientDef; # object on which each new gradient is created (set before stripes is called)
our $gradientID;  # number of the most recently created gradient
sub stripes {
    # Defines a linear "gradient" made of sharply separated vertical stripes.
    # Arguments:
    #   $x1, $x2     - x-coordinates where the gradient starts and ends
    #   $firstColor  - SVG color of the first stripe
    #   @stops       - any number of array refs [ position, color ], where
    #                  position is a number between 0 and 1 at which the
    #                  stripe of that color begins
    # Returns a "url(#gradN)" string that can be used as the value of a
    # stroke or fill attribute.
    my ($x1, $x2, $firstColor, @stops) = @_;
    my $gradientElement = $gradientDef->gradient(
        -type         => 'linear',
        gradientUnits => "userSpaceOnUse",
        id            => 'grad' . ++$gradientID,
        x1            => $x1,
        x2            => $x2,
    );
    my $currentColor = $firstColor;
    $gradientElement->stop(offset => '0%', 'stop-color' => $currentColor);
    for my $stripe (@stops) {
        my ($position, $nextColor) = @{$stripe}[0, 1];
        my $offset = 100*$position . '%';
        # Two stops at the same offset make a hard edge instead of a blend:
        # the outgoing color ends exactly where the incoming color begins.
        $gradientElement->stop(offset => $offset, 'stop-color' => $currentColor);
        $gradientElement->stop(offset => $offset, 'stop-color' => $nextColor);
        $currentColor = $nextColor;
    }
    $gradientElement->stop(offset => '100%', 'stop-color' => $currentColor);
    return "url(#grad$gradientID)";
}
sub determineTextColor {
    # Given the same arguments as stripes() -- ($x1, $x2, $firstColor, @stops),
    # where each stop is an array ref [ position, background color ] -- returns
    # a value to use for the "fill" of text drawn over those background colors.
    # Every background color is translated through %textColors. Stops where the
    # text color does not change are dropped; if none remain, the result is a
    # plain color name, otherwise it is the striped gradient built by stripes().
    #
    # The caller's stop array refs are never modified: new [ position, color ]
    # pairs are built here, so the same argument list can safely be reused
    # after this call.
    my ($x1, $x2, $firstColor, @stops) = @_;
    my $firstTextColor = $textColors{$firstColor};
    my $lastTextColor = $firstTextColor;
    my @textStops = ();
    foreach my $stop (@stops) {
        my $thisTextColor = $textColors{$stop->[1]};
        # This stop is redundant when the text color stays the same across it.
        next if $thisTextColor eq $lastTextColor;
        push(@textStops, [$stop->[0], $thisTextColor]);
        $lastTextColor = $thisTextColor;
    }
    return (@textStops
        ? stripes($x1, $x2, $firstTextColor, @textStops)
        : # We can just return a solid color.
        $firstTextColor);
}
# Planes to draw, in ascending numeric order: every plane that has data when
# -all was given, otherwise only the planes that have an acronym.
my @desiredPlanes = sort { $a <=> $b }
    keys %{ $opt_all ? \%dataByPlane : \%planeAcronymByNumber };
foreach my $thisPlane (@desiredPlanes) {
my @scripts = @{$legendByPlane{$thisPlane}};
my $dataString = $dataByPlane{$thisPlane};
if(($dataString =~ tr/\n//) == 0) {
# We need at least two lines so just duplicate the single line
$dataString .= "\n$dataString";
}
$gradientID = -1;
# Process $dataString
$dataString =~ s {\#.+} {}gm;
our @d = ();
{
foreach (split /\s*\n\s*/, $dataString) {
/\S/ or next;
/(.)(.)(.)\s+(.)\s+(.+)/;
die "Fatal data error: \"$5\" not found in \%legendColorByGroup\n"
unless defined($legendColorByGroup{$5});
push(@d, [hex($2), hex($3), hex($4), $legendColorByGroup{$5}, hex($1)]);
}
}
# Set up the SVG
my $sideLength = 550;
# Height and width of the roadmap square in pixels.
# The following sizes are expressed as fractions of
# $sideLength.
my $lineWidth = 1/250; # The width of the divider lines.
my $legendWidth = 7/10; # Width of the margin used for the legend (was 1/2 but was made bigger for various languages)
my $legendBoxSpace = 1/30; # Space between the rectangle for each legend and the right edge of the roadmap square.
my $legendTopMargin = 1/30; # Space between the first box of the legend and the top of the image.
my $legendBoxWidth = 1/20;
my $legendBoxHeightDefault = 2/77; # NOTE: This affects the legend text size too.
# $legendBoxHeightDefault was 1/40 (24/960) but was changed to this ridiculous fraction so that most languages will use 13.00px for font-size.
my $legendTextSpace = 1/50; # Space between the left edge of each legend box and its descriptive text.
my $legendLineBreak = 1/50;
$$_ *= $sideLength
foreach (\$lineWidth, \$legendWidth, \$legendBoxSpace,
\$legendBoxHeightDefault, \$legendTopMargin,
\$legendBoxWidth, \$legendTextSpace,
\$legendLineBreak);
our $svg = new SVG(-encoding => 'UTF-8',
-printerror => 1, # print processing errors to STDERR
width => $sideLength, # width without legend
height => $sideLength);
# create <title>
my $title;
if(defined($planeAcronymByNumber{$thisPlane})) {
$title = "Roadmap to the Unicode $planeAcronymByNumber{$thisPlane}";
} else {
$title = "Roadmap to Unicode Plane $thisPlane";
}
$svg->title->cdata($title);
# create <desc>
my($year, $month, $day);
($_,$_,$_,$day,$month,$year) = localtime(time);
my $desc = "This chart is a roadmap of character allocation for Unicode plane $thisPlane";
if(exists($planeNameByNumber{$thisPlane})) {
$desc .= ": $planeNameByNumber{$thisPlane}";
}
if(exists($planeAcronymByNumber{$thisPlane})) {
$desc .= " ($planeAcronymByNumber{$thisPlane})";
}
$desc .= sprintf ". It was created on %04d-%02d-%02d using Unicode $version data.", $year+1900, $month+1, $day;
if(scalar @scripts > 3) {
$desc .= " Characters are categorized more-or-less by the chapters in The Unicode Standard.";
}
$svg->desc->cdata($desc);
$gradientDef = $svg->defs;
# Declare this here to ensure that the gradient definitions appear in the file before anything else,
# especially the rectangles that reference them.
$svg->rectangle(x => 0,
y => 0,
width => $sideLength, # width without legend
height => $sideLength,
fill => 'white');
our $rectangleGroup = $svg->group('stroke-width' => ($lineWidth . 'px'),
'stroke' => 'gainsboro');
our $squareSideLength = ($sideLength - $lineWidth) / 16;
# determine font size for code point numbers
my $hexFontSize;
if($thisPlane == 16) {
# small font size to accommodate four digits
$hexFontSize = $squareSideLength/2.8;
} elsif($thisPlane == 0) {
# large font size for two digits
$hexFontSize = $squareSideLength/2.2;
} else {
# medium font size for three digits
$hexFontSize = $squareSideLength/2.4;
}
$hexFontSize = 0 + sprintf "%.1f", $hexFontSize; # round to one decimal place
# this defines each code point box
our $roadmapTextGroup = $svg->group('text-anchor' => 'middle',
'font-family' => $defaultFontFamily,
'font-size' => ($hexFontSize . 'px'));
# Draw the roadmap square: a 16 x 16 grid with one filled, labeled box per cell.
{
# Each entry of @d is an array reference read here as [row, column, offset, color]:
# [0] is compared with $y, [1] with $x, [2] is divided by 16 before being handed to
# stripes(), and [3] is used as a fill color.
# NOTE(review): this assumes @d holds at least two entries here; confirm against where @d is built.
my $lastColorUsed = shift(@d)->[3]; # The last color we used.
my @nextStop = @{shift @d}; # The next stop (equivalent to one line of the $dataString).
foreach my $y (0 .. 15) {
foreach my $x (0 .. 15) {
# $y and $x correspond to the first and second digits, respectively, of each character's code point
# Top-left corner of this box, inset by half a line width.
my $xp = $lineWidth/2 + $x*$squareSideLength;
my $yp = $lineWidth/2 + $y*$squareSideLength;
my ($squareFill, $textFill);
my @stopsHere = ();
# Stops that occur in this square.
# Consume every pending stop whose row/column match this cell; @nextStop becomes empty once @d is exhausted.
while (@nextStop and $nextStop[0] == $y and $nextStop[1] == $x) {
push(@stopsHere, [@nextStop]);
@nextStop = (@d ? @{shift @d} : ());
}
if (@stopsHere) {
# A first stop with offset 0 begins at the very start of this square, so it just
# replaces the current color instead of being passed on as a color change inside the square.
$stopsHere[0][2] or $lastColorUsed = shift(@stopsHere)->[3];
# Arguments: left edge, right edge, starting color, then one [fraction, color] pair per remaining stop.
# NOTE(review): stripes() and determineTextColor() are defined elsewhere; presumably they return
# a fill (gradient reference or plain color) and a readable text color for it — confirm.
my @args = ($xp,
$xp + $squareSideLength,
$lastColorUsed,
map {[ $_->[2]/16, $_->[3] ]} @stopsHere);
$squareFill = stripes(@args);
$textFill = determineTextColor(@args);
# The color of the last stop in this square carries over to the following squares.
@stopsHere and $lastColorUsed = $stopsHere[-1][3];
} else {
# No stop in this square: keep the current color and look up its text color.
$squareFill = $lastColorUsed;
$textFill = $textColors{$squareFill};
}
$rectangleGroup->rectangle(x => $xp, y => $yp,
width => $squareSideLength,
height => $squareSideLength,
fill => $squareFill);
# Label: horizontally centered (the group uses text-anchor "middle"), baseline two thirds down the box.
# '%.0X' prints nothing for plane 0, so the BMP gets two hex digits and other planes get the plane prefix.
$roadmapTextGroup->text(x => ($xp + $squareSideLength/2),
y => ($yp + (2/3)*$squareSideLength),
fill => $textFill
)->cdata(sprintf('%.0X%X%X', $thisPlane, $y, $x));
}
}
}
# Create output without legend
# Serialize the SVG now, before any legend elements are added to it.
my $xmlOutput = $svg->xmlify;
# The legend-free file is skipped whenever $opt_test is set.
if(!$opt_test) {
writeFile($xmlOutput, $thisPlane, "no legend");
}
# Draw the legend: for every language, a column of color boxes plus the matching text lines.
# The per-language groups are children of two <switch> elements (one for boxes, one for words).
{
my $legendRectanglesGroup = $svg->group('stroke-width' => $lineWidth . 'px',
stroke => 'black');
my $legendRectanglesSwitch = $legendRectanglesGroup->switch('allowReorder' => 'yes');
$legendBoxHeightDefault = 0 + sprintf "%.1f", $legendBoxHeightDefault; # round to one decimal digit
my $legendWordsGroup = $svg->group('text-anchor' => 'start',
'font-family' => $defaultFontFamily,
'font-size' => $legendBoxHeightDefault . 'px');
my $legendWordsSwitch = $legendWordsGroup->switch('allowReorder' => 'yes');
# customLanguageSort orders the languages case-insensitively and always puts 'default' last,
# so the group without a systemLanguage attribute ends up as the final child of each <switch>.
foreach my $thisLanguage (sort customLanguageSort keys %legendTextByLanguageAndGroup) {
# Left edge of the color boxes: near the right end of the legend for RTL languages,
# near the left end (just right of the roadmap square) otherwise.
my $x;
if(exists($isRTL{$thisLanguage})) {
$x = ceil($sideLength + $legendWidth - $legendBoxSpace - $legendBoxWidth - $lineWidth / 2);
} else {
$x = ceil($sideLength + $legendBoxSpace + $lineWidth / 2);
}
# Running vertical position; it is advanced after every legend entry and text line.
my $y = floor($legendTopMargin + $lineWidth / 2);
# Create the group for the little color boxes in the legend for this specific language
my $legendRectanglesThisLanguage = $legendRectanglesSwitch->group();
# Create the group for the words in the legend for this specific language
my $legendWordsThisLanguage = $legendWordsSwitch->group();
# Override the default font if necessary
if(defined($fontFamilyOverride{$thisLanguage})) {
$legendWordsThisLanguage->setAttribute('font-family' => $fontFamilyOverride{$thisLanguage});
}
# Make text larger for some languages because they have more space:
# (when $opt_test is 'ko' or 'fa', that sizing is applied to every language)
# NOTE(review): the font-size set here has no 'px' unit, unlike the group default above — confirm that is intended.
my $legendBoxHeight = $legendBoxHeightDefault;
if($thisLanguage eq 'ko' || $opt_test eq 'ko') {
# Korean
$legendBoxHeight = 0 + sprintf "%.1f", $legendBoxHeightDefault * 1.1;
$legendWordsThisLanguage->setAttribute('font-size' => $legendBoxHeight);
} elsif($thisLanguage eq 'fa' || $opt_test eq 'fa') {
# Persian
$legendBoxHeight = 0 + sprintf "%.1f", $legendBoxHeightDefault * 1.2;
$legendWordsThisLanguage->setAttribute('font-size' => $legendBoxHeight);
}
if($thisLanguage eq 'default') {
# Add default xml:lang language
$legendWordsThisLanguage->setAttribute('xml:lang' => $defaultLanguage);
} else {
# Add systemLanguage to both legend groups (used to choose which language to show)
$legendRectanglesThisLanguage->setAttribute(systemLanguage => $thisLanguage);
$legendWordsThisLanguage->setAttribute(systemLanguage => $thisLanguage);
# Add xml:lang language (not used to choose language but can affect font choice)
my $xmlLang = $thisLanguage;
$xmlLang =~ s/,.*$//; # xml:lang only handles one language so use the first in the list
$legendWordsThisLanguage->setAttribute('xml:lang' => $xmlLang);
if(exists($isRTL{$thisLanguage})) {
# Set anchor for words (reversed for RTL)
$legendWordsThisLanguage->setAttribute('text-anchor' => 'end');
}
}
foreach my $thisGroup (@scripts) {
# Every entry except the "asOfVersion" footer gets a color box.
if($thisGroup ne "asOfVersion") {
$legendRectanglesThisLanguage->rectangle(x => $x,
y => $y,
width => $legendBoxWidth,
height => $legendBoxHeight,
fill => $legendColorByGroup{$thisGroup});
}
# Move down to the baseline of the first text line of this entry.
$y += $legendLineBreak;
# Legend text may span several lines; each line becomes its own <text> element.
my @txt = split /\n/, $legendTextByLanguageAndGroup{$thisLanguage}{$thisGroup};
foreach my $thisLine (@txt) {
if($thisGroup eq "asOfVersion") {
# Footer line: shift $x by one box width (no box is drawn), add an extra line break,
# and fill the Unicode version into the translated template.
# NOTE(review): $x is not restored afterwards and the whole template is re-formatted for every
# line, so this presumably relies on "asOfVersion" being the last entry and a single line — confirm.
# NOTE(review): the shift is to the left for RTL languages as well — confirm that is the intended direction.
$x -= $legendBoxWidth;
$y += $legendLineBreak;
$thisLine = sprintf $legendTextByLanguageAndGroup{$thisLanguage}{$thisGroup}, $version;
if($thisLanguage eq 'fa' || $opt_test eq 'fa') {
$thisLine =~ tr/0-9./۰-۹٫/; # switch to Persian numerals
}
}
if(exists($isRTL{$thisLanguage})) {
# RTL: the text ends a third of a box width to the left of the box (anchor is 'end').
$legendWordsThisLanguage->text(x => ($x - int($legendBoxWidth / 3)),
y => $y
)->cdata($thisLine);
} else {
# LTR: the text starts to the right of the box, separated by $legendTextSpace.
$legendWordsThisLanguage->text(x => ($x + $legendBoxWidth + $legendTextSpace),
y => $y
)->cdata($thisLine);
}
# Advance by 1.25 times the box height for the next line or entry.
$y += (5/4)* sprintf "%.2f", $legendBoxHeight;
}
}
}
}
# Create output with legend
$xmlOutput = $svg->xmlify;
my $newWidth = $sideLength + $legendWidth; # adjust width to include the legend
# Every width="$sideLength" in the serialized text is widened; this includes the white background
# rectangle, which was created with width => $sideLength (and presumably the root <svg> width).
$xmlOutput =~ s/width="$sideLength"/width="$newWidth"/g;
if($opt_test) {
# Add dashed lines to the legend to make it easier to determine if the text is going to be truncated
# (one vertical line at the right edge and one horizontal line at the bottom of the legend area)
$xmlOutput =~ s/(<\/svg>)/<line x1=\"$newWidth\" y1=\"0\" x2=\"$newWidth\" y2=\"$sideLength\" style=\"stroke:rgb(0,0,0);stroke-width:2;stroke-dasharray:5,5\" \/><line x1=\"$sideLength\" y1=\"$sideLength\" x2=\"$newWidth\" y2=\"$sideLength\" style=\"stroke:rgb(0,0,0);stroke-width:2;stroke-dasharray:5,5\" \/>$1/;
}
writeFile($xmlOutput, $thisPlane, "multilingual");
# Create monolingual English version
# Start again from a fresh serialization, because $xmlOutput above already had its widths rewritten.
$xmlOutput = $svg->xmlify;
$newWidth = $sideLength * 1.5; # adjust width to include the smaller legend
$xmlOutput =~ s/width="$sideLength"/width="$newWidth"/g;
# this is a hack... ideally it should alter the groups themselves using the SVG package but regex is easier to get working (and easier to break)
# Only the groups without a systemLanguage attribute (the 'default' language) survive this step.
$xmlOutput =~ s/\s*<g[^>]+?systemLanguage.*?<\/g>//gms; # delete all groups with systemLanguage
if($xmlOutput =~ m/systemLanguage/) {
warn "Warning: systemLanguage still present, monolingual versions probably wrong\n";
} else {
# With a single language left, the <switch> wrappers are no longer needed.
$xmlOutput =~ s/\s*<switch\s*[^>]*?>\s*//gms; # delete all <switch>s
$xmlOutput =~ s/\s*<\/switch\s*>\s*//gms; # delete all </switch>s
if($xmlOutput =~ m/<switch/) {
warn "Warning: switch still present, monolingual versions probably wrong\n";
} else {
# condense empty groups
# (merge a group with the attribute-less or xml:lang-only group nested directly inside it,
# then drop the now-redundant second closing tag)
$xmlOutput =~ s/(<g [^>]+?>)<g\s*>/$1/gms;
$xmlOutput =~ s/(<g [^>]+?)><g( xml:lang="..+?")>/$1$2>/gms;
$xmlOutput =~ s/(<\/g>)<\/g>/$1/gms;
# tidy up
# (reduce the indentation left over from the removed nesting levels and put adjacent tags on separate lines)
$xmlOutput =~ s/\t\t\t\t/\t\t/g;
$xmlOutput =~ s/\t\t\t/\t\t/g;
$xmlOutput =~ s/\t(\t<\/g>)/$1/g;
$xmlOutput =~ s/(<\/g>)(<g)/$1\n\t$2/g;
$xmlOutput =~ s/(<\/g>)(<\/svg)/$1\n$2/g;
}
}
# The English-only file is skipped whenever $opt_test is set.
if(!$opt_test) {
writeFile($xmlOutput, $thisPlane, "en");
}
} # End of plane loop
# writeFile($txt, $planeNumber, $suffix)
#
# Applies a series of textual fix-ups to serialized SVG and writes the result,
# UTF-8 encoded, to "$outputDirectory/<name>.svg".
#
#   $txt         - the SVG document as text
#   $planeNumber - Unicode plane number; looked up in %planeAcronymByNumber to
#                  build the file name, falling back to "Znn" (two digits)
#   $suffix      - variation name appended to the file name (spaces allowed;
#                  all spaces in the file name become underscores)
#
# Reads the file-level $outputDirectory and %planeAcronymByNumber.
# Dies with the system error message if the file cannot be opened, written or closed.
sub writeFile {
    my ($txt, $planeNumber, $suffix) = @_;

    # Fix-ups: Remove extra space in <text> elements. Inkscape ignores it, but
    # librsvg treats it like a normal character, thus messing up text alignment.
    $txt =~ s{\s+</text>\s} {</text>\n}g;
    # Do the same for other elements as well.
    # NOTE(review): these three have no /g, so only the first occurrence of each
    # is rewritten. Kept as-is because the callers lay out </g> on its own line
    # on purpose; confirm whether /g was ever intended here.
    $txt =~ s{\s+</g>\s} {</g>\n};
    $txt =~ s{\s+</title>\s} {</title>\n};
    $txt =~ s{\s+</switch>\s} {</switch>\n};
    # Get rid of space after elements with no attributes
    $txt =~ s{<([A-Za-z]+)\s+>} {<$1>}g;
    # Change to Unix-style newlines if necessary.
    $txt =~ s{\015\012?} {\012}g;
    # Remove comment containing non-existent www.roitsystems.com URL
    # (dots escaped so that only the literal host name matches)
    $txt =~ s/(<!--[^>]*?www\.roitsystems\.com.*?-->)//gsm;
    # Remove blank lines
    $txt =~ s/^\s*\n+//mg;
    # Fix odd problem with 0x80-0xff
    # not sure why but 0x80-0xff get turned into " & #nnn;" where nnn is the decimal character value
    $txt =~ s/ & #([0-9]+);/chr($1)/ge;

    # Determine file name
    my $filename;
    if (defined($planeAcronymByNumber{$planeNumber})) {
        $filename = "map $planeAcronymByNumber{$planeNumber} $suffix.svg";
    } else {
        # Pass $suffix as an argument so a '%' in it cannot be read as a format directive.
        $filename = sprintf "map Z%.2i %s.svg", $planeNumber, $suffix;
    }
    $filename =~ s/ /_/g; # replace spaces with underscores

    # Write the file through a lexical handle with an explicit mode and encoding,
    # and report the system error when anything fails.
    my $path = "$outputDirectory/$filename";
    open(my $out, '>:encoding(UTF-8)', $path)
        or die("Can't open $path: $!\n");
    print {$out} $txt
        or die("Can't write to $path: $!\n");
    # Buffered write errors only surface at close time, so check it too.
    close($out)
        or die("Can't close $path: $!\n");
}
# Comparator for sort(): orders language codes case-insensitively, except that
# the special key "default" always sorts after every other key.
# Uses the package variables $a and $b that sort() provides.
sub customLanguageSort
{
    # "default" on the left-hand side always loses...
    return 1 if $a eq "default";
    # ...and "default" on the right-hand side lets the other key win.
    return -1 if $b eq "default";
    # Everything else: plain alphabetical order, ignoring case.
    my ($left, $right) = (lc($a), lc($b));
    return $left cmp $right;
}
# end of script