XML

Character Encoding - XML

Character encoding is a system of rules for converting characters and symbols into byte sequences that computers can process. Many schemes exist, ranging from international standards such as ASCII and UTF-8, to Japanese-specific encodings such as Shift_JIS and EUC-JP, to country-specific code pages. Although UTF-8, which is based on Unicode, is now widely adopted as the global standard, understanding the other encoding schemes remains important for maintaining compatibility with legacy systems.

character code Unicode UTF-8 character set internationalization text processing
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Catalogue of character encodings: one <item> per encoding.
  Items that share a <category> value are listed next to each other.

  Children of every <item>, always in this order:
    code        : lowercase identifier of the encoding
    slug        : same text as code, except that "shift_jis" becomes "shift-jis"
                  (presumably a URL-safe form; confirm with the consumer)
    name        : display name, sometimes with an alias in parentheses
    description : one-sentence English summary
    category    : grouping label; the values used in this file are Unicode, ASCII,
                  ISO-8859, Japanese, Chinese, Korean, Cyrillic and Windows Code Page
    ianaName    : presumably the name or preferred MIME name from the IANA
                  Character Sets registry (verify against the registry)
    mibEnum     : presumably the MIBenum number from that same registry
                  (verify against the registry)
-->
<items>
  <!-- Category: Unicode -->
  <item>
    <code>utf-8</code>
    <slug>utf-8</slug>
    <name>UTF-8</name>
    <description>A variable-length character encoding that represents Unicode using 1 to 4 bytes.</description>
    <category>Unicode</category>
    <ianaName>UTF-8</ianaName>
    <mibEnum>106</mibEnum>
  </item>
  <item>
    <code>utf-16</code>
    <slug>utf-16</slug>
    <name>UTF-16</name>
    <description>A character encoding that represents Unicode in 16-bit units.</description>
    <category>Unicode</category>
    <ianaName>UTF-16</ianaName>
    <mibEnum>1015</mibEnum>
  </item>
  <item>
    <code>utf-32</code>
    <slug>utf-32</slug>
    <name>UTF-32</name>
    <description>A character encoding that represents Unicode in fixed-length 32 bits (4 bytes).</description>
    <category>Unicode</category>
    <ianaName>UTF-32</ianaName>
    <mibEnum>1017</mibEnum>
  </item>
  <!-- Category: ASCII -->
  <item>
    <code>us-ascii</code>
    <slug>us-ascii</slug>
    <name>US-ASCII</name>
    <description>A basic character encoding that defines 128 characters in 7 bits.</description>
    <category>ASCII</category>
    <ianaName>US-ASCII</ianaName>
    <mibEnum>3</mibEnum>
  </item>
  <!-- Category: ISO-8859 (note: ISO-8859-5 is filed here, not under Cyrillic) -->
  <item>
    <code>iso-8859-1</code>
    <slug>iso-8859-1</slug>
    <name>ISO-8859-1 (Latin-1)</name>
    <description>An 8-bit character encoding for Western European languages.</description>
    <category>ISO-8859</category>
    <ianaName>ISO-8859-1</ianaName>
    <mibEnum>4</mibEnum>
  </item>
  <item>
    <code>iso-8859-2</code>
    <slug>iso-8859-2</slug>
    <name>ISO-8859-2 (Latin-2)</name>
    <description>An 8-bit character encoding for Central European languages.</description>
    <category>ISO-8859</category>
    <ianaName>ISO-8859-2</ianaName>
    <mibEnum>5</mibEnum>
  </item>
  <item>
    <code>iso-8859-5</code>
    <slug>iso-8859-5</slug>
    <name>ISO-8859-5 (Cyrillic)</name>
    <description>An 8-bit character encoding for Cyrillic script.</description>
    <category>ISO-8859</category>
    <ianaName>ISO-8859-5</ianaName>
    <mibEnum>8</mibEnum>
  </item>
  <item>
    <code>iso-8859-7</code>
    <slug>iso-8859-7</slug>
    <name>ISO-8859-7 (Greek)</name>
    <description>An 8-bit character encoding for Modern Greek.</description>
    <category>ISO-8859</category>
    <ianaName>ISO-8859-7</ianaName>
    <mibEnum>10</mibEnum>
  </item>
  <item>
    <code>iso-8859-15</code>
    <slug>iso-8859-15</slug>
    <name>ISO-8859-15 (Latin-9)</name>
    <description>A revised version of ISO-8859-1 that includes the Euro sign.</description>
    <category>ISO-8859</category>
    <ianaName>ISO-8859-15</ianaName>
    <mibEnum>111</mibEnum>
  </item>
  <!-- Category: Japanese -->
  <item>
    <code>shift_jis</code>
    <slug>shift-jis</slug>
    <name>Shift_JIS</name>
    <description>A Japanese character encoding standardly used on Windows and Macintosh.</description>
    <category>Japanese</category>
    <ianaName>Shift_JIS</ianaName>
    <mibEnum>17</mibEnum>
  </item>
  <item>
    <code>euc-jp</code>
    <slug>euc-jp</slug>
    <name>EUC-JP</name>
    <description>A Japanese character encoding used on Unix-like systems.</description>
    <category>Japanese</category>
    <ianaName>EUC-JP</ianaName>
    <mibEnum>18</mibEnum>
  </item>
  <item>
    <code>iso-2022-jp</code>
    <slug>iso-2022-jp</slug>
    <name>ISO-2022-JP</name>
    <description>An encoding for Japanese email in 7-bit environments.</description>
    <category>Japanese</category>
    <ianaName>ISO-2022-JP</ianaName>
    <mibEnum>39</mibEnum>
  </item>
  <!-- Category: Chinese (Simplified and Traditional) -->
  <item>
    <code>gb2312</code>
    <slug>gb2312</slug>
    <name>GB2312</name>
    <description>A basic character encoding for Simplified Chinese.</description>
    <category>Chinese</category>
    <ianaName>GB2312</ianaName>
    <mibEnum>2025</mibEnum>
  </item>
  <item>
    <code>gbk</code>
    <slug>gbk</slug>
    <name>GBK</name>
    <description>A Chinese character encoding that extends GB2312.</description>
    <category>Chinese</category>
    <ianaName>GBK</ianaName>
    <mibEnum>113</mibEnum>
  </item>
  <item>
    <code>gb18030</code>
    <slug>gb18030</slug>
    <name>GB18030</name>
    <description>China&apos;s current national standard, capable of representing all Unicode characters.</description>
    <category>Chinese</category>
    <ianaName>GB18030</ianaName>
    <mibEnum>114</mibEnum>
  </item>
  <item>
    <code>big5</code>
    <slug>big5</slug>
    <name>Big5</name>
    <description>A Traditional Chinese character encoding used in Taiwan and Hong Kong.</description>
    <category>Chinese</category>
    <ianaName>Big5</ianaName>
    <mibEnum>2026</mibEnum>
  </item>
  <!-- Category: Korean -->
  <item>
    <code>euc-kr</code>
    <slug>euc-kr</slug>
    <name>EUC-KR</name>
    <description>A Korean character encoding used on Unix-like systems.</description>
    <category>Korean</category>
    <ianaName>EUC-KR</ianaName>
    <mibEnum>38</mibEnum>
  </item>
  <item>
    <code>iso-2022-kr</code>
    <slug>iso-2022-kr</slug>
    <name>ISO-2022-KR</name>
    <description>An encoding for Korean email in 7-bit environments.</description>
    <category>Korean</category>
    <ianaName>ISO-2022-KR</ianaName>
    <mibEnum>37</mibEnum>
  </item>
  <!-- Category: Cyrillic (KOI8 family only) -->
  <item>
    <code>koi8-r</code>
    <slug>koi8-r</slug>
    <name>KOI8-R</name>
    <description>An 8-bit character encoding for Russian Cyrillic.</description>
    <category>Cyrillic</category>
    <ianaName>KOI8-R</ianaName>
    <mibEnum>2084</mibEnum>
  </item>
  <item>
    <code>koi8-u</code>
    <slug>koi8-u</slug>
    <name>KOI8-U</name>
    <description>An 8-bit character encoding for Ukrainian Cyrillic.</description>
    <category>Cyrillic</category>
    <ianaName>KOI8-U</ianaName>
    <mibEnum>2088</mibEnum>
  </item>
  <!-- Category: Windows Code Page -->
  <item>
    <code>windows-1252</code>
    <slug>windows-1252</slug>
    <name>Windows-1252</name>
    <description>An 8-bit encoding for Western European languages used on Microsoft Windows.</description>
    <category>Windows Code Page</category>
    <ianaName>windows-1252</ianaName>
    <mibEnum>2252</mibEnum>
  </item>
</items>