Unicode字符, UTF-8编码, 展示小工具

Unicode

www.unicode.org/Public/UNIDATA/Blocks.txt

UTF-8编码规则

image.png

Unicode 和 UTF-8 有什么区别? - 知乎 (zhihu.com)

代码


package com.vege;

import org.apache.tomcat.util.buf.HexUtils;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

/**
 * Small demonstration tool that hand-encodes Unicode code points into UTF-8
 * bytes and prints the resulting characters.
 *
 * <p>{@link #main(String[])} prints the character for U+4DC0 and then every
 * character of the block U+4DC0..U+4DFF (inclusive), sixteen per row.
 */
public class UnicodeToUtf8 {

    /** Maps each ASCII hexadecimal digit (0-9, A-F, a-f) to its numeric value. */
    private static final Map<Character, Integer> HEX_DIGIT_VALUES = new HashMap<>();

    static {
        final String upperDigits = "0123456789ABCDEF";
        for (int value = 0; value < upperDigits.length(); value++) {
            char upper = upperDigits.charAt(value);
            HEX_DIGIT_VALUES.put(upper, value);
            // Accept lowercase spellings too; for '0'-'9' this re-puts the same key.
            HEX_DIGIT_VALUES.put(Character.toLowerCase(upper), value);
        }
    }

    /**
     * Prints the character for code point 0x4DC0, then a table of all code
     * points from 0x4DC0 to 0x4DFF inclusive, sixteen characters per row.
     *
     * @param args unused
     * @throws IOException never thrown by this implementation; kept so the
     *                     signature stays compatible with existing callers
     */
    public static void main(String[] args) throws IOException {

        // Print the UTF-8 encoded character for Unicode code point 0x4DC0.
        String temp = unicodeToUtf8Str(hexStrToInt("4DC0"));
        System.out.println("0x4DC0 - " + temp);
        System.out.println();

        // ======================================================

        // Print every code point in [from, to], sixteen per row.
        int from = 0x4DC0;
        int to = 0x4DFF;
        // The upper bound is inclusive: the row label below advertises "...0 - ...f",
        // so the last code point of the range must be printed as well.
        for (int i = from, j = 0; i <= to; i++, j++) {
            if (j % 16 == 0) {
                System.out.println();
                // Eight hex digits for the int, minus the always-zero top byte's
                // first two digits, gives a six-digit label such as "0x004dc0".
                String start = "0x" + toHexString(intToByteArrayHighFirst(i)).substring(2);
                // The row ends at the same prefix with the last hex digit set to f.
                String end = start.substring(0, start.length() - 1) + "f";
                System.out.println("-> " + start + " - " + end + " : ");
            }

            System.out.print("\t");
            System.out.print(unicodeToUtf8Str(i));
        }

        System.out.println();
    }

    /**
     * Parses a hexadecimal string into an {@code int}.
     * For example {@code "4DC0"} (or {@code "4dc0"}) yields {@code 19904}.
     * An empty string yields {@code 0}.
     *
     * @param str hexadecimal digits without any {@code 0x} prefix
     * @return the parsed value; eight-digit inputs with the top bit set come
     *         back as negative ints (two's complement)
     * @throws IllegalArgumentException if {@code str} contains a character that
     *                                  is not a hex digit, or if the value needs
     *                                  more than 32 bits
     */
    private static int hexStrToInt(String str) {
        int result = 0;
        for (char c : str.toCharArray()) {
            Integer digit = HEX_DIGIT_VALUES.get(c);
            if (digit == null) {
                throw new IllegalArgumentException("invalid char");
            }
            // A non-zero top nibble would be shifted out and silently lost.
            if ((result >>> 28) != 0) {
                throw new IllegalArgumentException("hex value does not fit in an int: " + str);
            }
            result = (result << 4) | digit;
        }
        return result;
    }

    /**
     * Encodes a single Unicode code point as UTF-8 and returns it as a string.
     *
     * <p>Unicode code points occupy at most the low 21 bits of the int
     * (maximum 0x10FFFF). The encoded length depends on the range:
     * <ul>
     *   <li>0x0000..0x007F: 1 byte {@code 0xxxxxxx}</li>
     *   <li>0x0080..0x07FF: 2 bytes {@code 110xxxxx 10xxxxxx}</li>
     *   <li>0x0800..0xFFFF: 3 bytes {@code 1110xxxx 10xxxxxx 10xxxxxx}</li>
     *   <li>0x10000..0x10FFFF: 4 bytes {@code 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx}</li>
     * </ul>
     *
     * <p>Surrogate code points (0xD800..0xDFFF) are still run through the
     * three-byte form, but that byte sequence is not well-formed UTF-8, so the
     * decoding step substitutes replacement characters for it.
     *
     * @param unicode a single Unicode code point
     * @return the string obtained by decoding the hand-built UTF-8 bytes
     * @throws IllegalArgumentException if {@code unicode} is negative or
     *                                  greater than 0x10FFFF
     */
    private static String unicodeToUtf8Str(int unicode) {
        final byte[] bytes;
        if (unicode >= 0 && unicode <= 0x7F) {
            bytes = new byte[] {(byte) unicode};
        } else if (unicode >= 0x80 && unicode <= 0x7FF) {
            bytes = new byte[] {
                (byte) (0b11000000 | (unicode >> 6)),
                continuationByte(unicode, 0)
            };
        } else if (unicode >= 0x800 && unicode <= 0xFFFF) {
            bytes = new byte[] {
                (byte) (0b11100000 | (unicode >> 12)),
                continuationByte(unicode, 6),
                continuationByte(unicode, 0)
            };
        } else if (unicode >= 0x10000 && unicode <= 0x10FFFF) {
            // The lower bound is inclusive: U+10000 is the first four-byte code point.
            bytes = new byte[] {
                (byte) (0b11110000 | (unicode >> 18)),
                continuationByte(unicode, 12),
                continuationByte(unicode, 6),
                continuationByte(unicode, 0)
            };
        } else {
            throw new IllegalArgumentException("error unicode");
        }
        return new String(bytes, StandardCharsets.UTF_8);
    }

    /** Builds a UTF-8 continuation byte {@code 10xxxxxx} from the six bits of {@code unicode} starting at {@code shift}. */
    private static byte continuationByte(int unicode, int shift) {
        return (byte) (0b10000000 | ((unicode >> shift) & 0b111111));
    }

    /**
     * Splits an int into four bytes, most significant byte first.
     * For example {@code 19904} yields
     * {@code {0b00000000, 0b00000000, 0b01001101, (byte) 0b11000000}}.
     *
     * @param a the value to split
     * @return a new four-element array with the high byte at index 0
     */
    private static byte[] intToByteArrayHighFirst(int a) {
        byte[] b = new byte[4];
        b[3] = (byte) (a & 0xff);
        b[2] = (byte) (a >> 8 & 0xff);
        b[1] = (byte) (a >> 16 & 0xff);
        b[0] = (byte) (a >> 24 & 0xff);
        return b;
    }

    /** Renders each byte as two lowercase hexadecimal digits, in array order. */
    private static String toHexString(byte[] bytes) {
        StringBuilder hex = new StringBuilder(bytes.length * 2);
        for (byte b : bytes) {
            hex.append(Character.forDigit((b >> 4) & 0xF, 16));
            hex.append(Character.forDigit(b & 0xF, 16));
        }
        return hex.toString();
    }

}

结果

image.png
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

友情鏈接更多精彩內(nèi)容