Unicode
www.unicode.org/Public/UNIDATA/Blocks.txt
UTF-8 編碼規則

image.png
Unicode 和 UTF-8 有什么區別? - 知乎 (zhihu.com)
代碼
package com.vege;
import org.apache.tomcat.util.buf.HexUtils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
/**
 * Demonstrates, by hand, how a Unicode code point is turned into its UTF-8 byte sequence.
 *
 * <p>The {@link #main(String[])} method prints the character for code point 0x4DC0 and then a
 * table of every character in the inclusive range 0x4DC0..0x4DFF, sixteen per row.
 */
public class UnicodeToUtf8 {

    /** Number of characters printed on each row of the table. */
    private static final int CHARS_PER_ROW = 16;

    /** Maximum number of hex digits that fit into a 32-bit int without losing bits. */
    private static final int MAX_HEX_DIGITS = 8;

    /**
     * Prints the UTF-8 character for code point 0x4DC0, followed by a table of all characters in
     * the inclusive range 0x4DC0..0x4DFF.
     *
     * @param args ignored
     * @throws IOException never thrown by the current body; kept for signature compatibility
     */
    public static void main(String[] args) throws IOException {
        // Print the character whose Unicode code point is 0x4DC0.
        String temp = unicodeToUtf8Str(hexStrToInt("4DC0"));
        System.out.println("0x4DC0 - " + temp);
        System.out.println();

        // ======================================================
        // Print every Unicode character from 'from' to 'to' (both inclusive),
        // sixteen characters per row.
        int from = 0x4DC0;
        int to = 0x4DFF;
        for (int codePoint = from, column = 0; codePoint <= to; codePoint++, column++) {
            if (column % CHARS_PER_ROW == 0) {
                System.out.println();
                // Drop the most significant byte so the header shows six hex digits.
                String start = "0x" + toHexString(intToByteArrayHighFirst(codePoint)).substring(2);
                // The row ends at the same prefix with the last hex digit set to f.
                String end = start.substring(0, start.length() - 1) + "f";
                System.out.println("-> " + start + " - " + end + " : ");
            }
            System.out.print("\t");
            System.out.print(unicodeToUtf8Str(codePoint));
        }
        System.out.println();
    }

    /**
     * Parses a hexadecimal number given as text into an int, e.g. {@code "4DC0" -> 19904}.
     *
     * <p>Both upper-case and lower-case digits are accepted. Nine or more digits are rejected
     * because they would silently overflow an int.
     *
     * @param str the hexadecimal digits, without any {@code 0x} prefix
     * @return the parsed value
     * @throws IllegalArgumentException if {@code str} is empty, longer than eight digits, or
     *     contains a character that is not a hexadecimal digit
     */
    private static int hexStrToInt(String str) {
        if (str.isEmpty()) {
            throw new IllegalArgumentException("empty hex string");
        }
        if (str.length() > MAX_HEX_DIGITS) {
            throw new IllegalArgumentException("hex string too long: " + str);
        }
        int result = 0;
        for (char c : str.toCharArray()) {
            int digit = hexDigitValue(c);
            if (digit < 0) {
                throw new IllegalArgumentException("invalid char");
            }
            result = (result << 4) | digit;
        }
        return result;
    }

    /** Returns the value 0..15 of an ASCII hexadecimal digit, or -1 if {@code c} is not one. */
    private static int hexDigitValue(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + 10;
        }
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + 10;
        }
        return -1;
    }

    /**
     * Encodes a single Unicode code point as UTF-8 and returns the resulting one-character string.
     *
     * <p>Code points occupy at most 21 bits (the maximum is 0x10FFFF). The number of bytes is
     * chosen from the value: 1 byte up to 0x7F, 2 bytes up to 0x7FF, 3 bytes up to 0xFFFF and
     * 4 bytes up to 0x10FFFF.
     *
     * <p>NOTE(review): surrogate values 0xD800..0xDFFF are not rejected; they are encoded with the
     * 3-byte form like any other value in that range, which presumably decodes to replacement
     * characters. Confirm whether strict validation is wanted.
     *
     * @param unicode a single Unicode code point
     * @return the string decoded from the UTF-8 bytes of that code point
     * @throws RuntimeException if {@code unicode} is negative or greater than 0x10FFFF
     */
    private static String unicodeToUtf8Str(int unicode) {
        byte[] bytes;
        if (unicode >= 0 && unicode <= 0x7F) {
            // 0xxxxxxx
            bytes = new byte[] {(byte) unicode};
        } else if (unicode > 0x7F && unicode <= 0x7FF) {
            // 110xxxxx 10xxxxxx
            bytes = new byte[] {
                (byte) (0b11000000 | (unicode >> 6)),
                continuationByte(unicode, 0)
            };
        } else if (unicode > 0x7FF && unicode <= 0xFFFF) {
            // 1110xxxx 10xxxxxx 10xxxxxx
            bytes = new byte[] {
                (byte) (0b11100000 | (unicode >> 12)),
                continuationByte(unicode, 6),
                continuationByte(unicode, 0)
            };
        } else if (unicode >= 0x10000 && unicode <= 0x10FFFF) {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            // The lower bound is inclusive: 0x10000 is the first code point needing four bytes.
            bytes = new byte[] {
                (byte) (0b11110000 | (unicode >> 18)),
                continuationByte(unicode, 12),
                continuationByte(unicode, 6),
                continuationByte(unicode, 0)
            };
        } else {
            throw new RuntimeException("error unicode");
        }
        return new String(bytes, StandardCharsets.UTF_8);
    }

    /** Builds a {@code 10xxxxxx} continuation byte from the six bits of {@code value} at {@code shift}. */
    private static byte continuationByte(int value, int shift) {
        return (byte) (0b10000000 | ((value >> shift) & 0b111111));
    }

    /**
     * Splits an int into four bytes, most significant byte first, e.g.
     * {@code 19904 -> {0b00000000, 0b00000000, 0b01001101, 0b11000000}}.
     *
     * @param a the int to split
     * @return a four-element array holding the bytes of {@code a}, high byte at index 0
     */
    private static byte[] intToByteArrayHighFirst(int a) {
        byte[] b = new byte[4];
        b[3] = (byte) (a & 0xff);
        b[2] = (byte) (a >> 8 & 0xff);
        b[1] = (byte) (a >> 16 & 0xff);
        b[0] = (byte) (a >> 24 & 0xff);
        return b;
    }

    /** Formats each byte as two lower-case hexadecimal digits and concatenates them. */
    private static String toHexString(byte[] bytes) {
        StringBuilder sb = new StringBuilder(bytes.length * 2);
        for (byte b : bytes) {
            sb.append(Character.forDigit((b >> 4) & 0xF, 16));
            sb.append(Character.forDigit(b & 0xF, 16));
        }
        return sb.toString();
    }
}
結果

image.png