XML
Character Encoding - XML
A character encoding is a set of rules for converting characters and symbols into byte sequences that computers can process. Many schemes exist, ranging from international standards such as ASCII and UTF-8, to Japanese-specific encodings such as Shift_JIS and EUC-JP, to country-specific code pages. Although UTF-8, which is based on Unicode, is now widely adopted as the global standard, understanding the other encoding schemes remains important for maintaining compatibility with legacy systems.
character code
Unicode
UTF-8
character set
internationalization
text processing
<?xml version="1.0" encoding="UTF-8"?>
<items>
<!-- UTF-8 entry, filed under the Unicode category. -->
<item>
  <code>utf-8</code>
  <slug>utf-8</slug>
  <name>UTF-8</name>
  <description>A variable-length character encoding that represents Unicode using 1 to 4 bytes.</description>
  <category>Unicode</category>
  <ianaName>UTF-8</ianaName>
  <mibEnum>106</mibEnum>
</item>
<!-- UTF-16 entry, filed under the Unicode category. -->
<item>
  <code>utf-16</code>
  <slug>utf-16</slug>
  <name>UTF-16</name>
  <description>A character encoding that represents Unicode in 16-bit units.</description>
  <category>Unicode</category>
  <ianaName>UTF-16</ianaName>
  <mibEnum>1015</mibEnum>
</item>
<!-- UTF-32 entry, filed under the Unicode category. -->
<item>
  <code>utf-32</code>
  <slug>utf-32</slug>
  <name>UTF-32</name>
  <description>A character encoding that represents Unicode in fixed-length 32 bits (4 bytes).</description>
  <category>Unicode</category>
  <ianaName>UTF-32</ianaName>
  <mibEnum>1017</mibEnum>
</item>
<!-- US-ASCII entry; the only record in the ASCII category within this list. -->
<item>
  <code>us-ascii</code>
  <slug>us-ascii</slug>
  <name>US-ASCII</name>
  <description>A basic character encoding that defines 128 characters in 7 bits.</description>
  <category>ASCII</category>
  <ianaName>US-ASCII</ianaName>
  <mibEnum>3</mibEnum>
</item>
<!-- ISO-8859-1 entry; the display name carries the "Latin-1" alias. -->
<item>
  <code>iso-8859-1</code>
  <slug>iso-8859-1</slug>
  <name>ISO-8859-1 (Latin-1)</name>
  <description>An 8-bit character encoding for Western European languages.</description>
  <category>ISO-8859</category>
  <ianaName>ISO-8859-1</ianaName>
  <mibEnum>4</mibEnum>
</item>
<!-- ISO-8859-2 entry; the display name carries the "Latin-2" alias. -->
<item>
  <code>iso-8859-2</code>
  <slug>iso-8859-2</slug>
  <name>ISO-8859-2 (Latin-2)</name>
  <description>An 8-bit character encoding for Central European languages.</description>
  <category>ISO-8859</category>
  <ianaName>ISO-8859-2</ianaName>
  <mibEnum>5</mibEnum>
</item>
<!-- ISO-8859-5 entry; categorised as ISO-8859 rather than Cyrillic. -->
<item>
  <code>iso-8859-5</code>
  <slug>iso-8859-5</slug>
  <name>ISO-8859-5 (Cyrillic)</name>
  <description>An 8-bit character encoding for Cyrillic script.</description>
  <category>ISO-8859</category>
  <ianaName>ISO-8859-5</ianaName>
  <mibEnum>8</mibEnum>
</item>
<!-- ISO-8859-7 entry; the display name is annotated with "Greek". -->
<item>
  <code>iso-8859-7</code>
  <slug>iso-8859-7</slug>
  <name>ISO-8859-7 (Greek)</name>
  <description>An 8-bit character encoding for Modern Greek.</description>
  <category>ISO-8859</category>
  <ianaName>ISO-8859-7</ianaName>
  <mibEnum>10</mibEnum>
</item>
<!-- ISO-8859-15 entry; the display name carries the "Latin-9" alias. -->
<item>
  <code>iso-8859-15</code>
  <slug>iso-8859-15</slug>
  <name>ISO-8859-15 (Latin-9)</name>
  <description>A revised version of ISO-8859-1 that includes the Euro sign.</description>
  <category>ISO-8859</category>
  <ianaName>ISO-8859-15</ianaName>
  <mibEnum>111</mibEnum>
</item>
<!-- Shift_JIS entry; note that code uses an underscore while slug uses a hyphen. -->
<item>
  <code>shift_jis</code>
  <slug>shift-jis</slug>
  <name>Shift_JIS</name>
  <description>A Japanese character encoding standardly used on Windows and Macintosh.</description>
  <category>Japanese</category>
  <ianaName>Shift_JIS</ianaName>
  <mibEnum>17</mibEnum>
</item>
<!-- EUC-JP entry, filed under the Japanese category. -->
<item>
  <code>euc-jp</code>
  <slug>euc-jp</slug>
  <name>EUC-JP</name>
  <description>A Japanese character encoding used on Unix-like systems.</description>
  <category>Japanese</category>
  <ianaName>EUC-JP</ianaName>
  <mibEnum>18</mibEnum>
</item>
<!-- ISO-2022-JP entry, filed under the Japanese category. -->
<item>
  <code>iso-2022-jp</code>
  <slug>iso-2022-jp</slug>
  <name>ISO-2022-JP</name>
  <description>An encoding for Japanese email in 7-bit environments.</description>
  <category>Japanese</category>
  <ianaName>ISO-2022-JP</ianaName>
  <mibEnum>39</mibEnum>
</item>
<!-- GB2312 entry, filed under the Chinese category. -->
<item>
  <code>gb2312</code>
  <slug>gb2312</slug>
  <name>GB2312</name>
  <description>A basic character encoding for Simplified Chinese.</description>
  <category>Chinese</category>
  <ianaName>GB2312</ianaName>
  <mibEnum>2025</mibEnum>
</item>
<!-- GBK entry, filed under the Chinese category. -->
<item>
  <code>gbk</code>
  <slug>gbk</slug>
  <name>GBK</name>
  <description>A Chinese character encoding that extends GB2312.</description>
  <category>Chinese</category>
  <ianaName>GBK</ianaName>
  <mibEnum>113</mibEnum>
</item>
<!-- GB18030 entry, filed under the Chinese category. -->
<item>
  <code>gb18030</code>
  <slug>gb18030</slug>
  <name>GB18030</name>
  <description>China's current national standard, capable of representing all Unicode characters.</description>
  <category>Chinese</category>
  <ianaName>GB18030</ianaName>
  <mibEnum>114</mibEnum>
</item>
<!-- Big5 entry; the Traditional Chinese record within the Chinese category. -->
<item>
  <code>big5</code>
  <slug>big5</slug>
  <name>Big5</name>
  <description>A Traditional Chinese character encoding used in Taiwan and Hong Kong.</description>
  <category>Chinese</category>
  <ianaName>Big5</ianaName>
  <mibEnum>2026</mibEnum>
</item>
<!-- EUC-KR entry, filed under the Korean category. -->
<item>
  <code>euc-kr</code>
  <slug>euc-kr</slug>
  <name>EUC-KR</name>
  <description>A Korean character encoding used on Unix-like systems.</description>
  <category>Korean</category>
  <ianaName>EUC-KR</ianaName>
  <mibEnum>38</mibEnum>
</item>
<!-- ISO-2022-KR entry, filed under the Korean category. -->
<item>
  <code>iso-2022-kr</code>
  <slug>iso-2022-kr</slug>
  <name>ISO-2022-KR</name>
  <description>An encoding for Korean email in 7-bit environments.</description>
  <category>Korean</category>
  <ianaName>ISO-2022-KR</ianaName>
  <mibEnum>37</mibEnum>
</item>
<!-- KOI8-R entry, filed under the Cyrillic category. -->
<item>
  <code>koi8-r</code>
  <slug>koi8-r</slug>
  <name>KOI8-R</name>
  <description>An 8-bit character encoding for Russian Cyrillic.</description>
  <category>Cyrillic</category>
  <ianaName>KOI8-R</ianaName>
  <mibEnum>2084</mibEnum>
</item>
<!-- KOI8-U entry, filed under the Cyrillic category. -->
<item>
  <code>koi8-u</code>
  <slug>koi8-u</slug>
  <name>KOI8-U</name>
  <description>An 8-bit character encoding for Ukrainian Cyrillic.</description>
  <category>Cyrillic</category>
  <ianaName>KOI8-U</ianaName>
  <mibEnum>2088</mibEnum>
</item>
<!-- Windows-1252 entry; ianaName is lowercase here, unlike the display name. -->
<item>
  <code>windows-1252</code>
  <slug>windows-1252</slug>
  <name>Windows-1252</name>
  <description>An 8-bit encoding for Western European languages used on Microsoft Windows.</description>
  <category>Windows Code Page</category>
  <ianaName>windows-1252</ianaName>
  <mibEnum>2252</mibEnum>
</item>
</items>