用js解析html字符串
目標:用 js 將 html 字符串解析為一個類似於虛擬 dom 的對象
// Sample document used as parser input. The lines are joined with "\n" so the
// resulting string is identical to a multi-line template literal.
const htmlStr = [
  "<html>",
  "<head></head>",
  "<body>",
  "<h1>我是標簽</h1>",
  "<div>我是div標簽</div>",
  '<span id="root" style="color:red">我是span標簽</span>',
  "</body>",
  "</html>",
].join("\n");

// Desired entry point: turn the HTML string into a nested node object.
htmlTransform(htmlStr);
// Expected result format:
// { nodeName: 'html', children: [ ...,{ nodeName: 'body', id: 'xxxx', }, .... ] }
開發的 htmlstr-parser-n 插件
安裝:npm i htmlstr-parser-n,使用方式如下:
// The module name must match the package installed with `npm i htmlstr-parser-n`;
// requiring "html-parser-n" would look up a different package.
const { htmlObjParser, htmlStrParser } = require("htmlstr-parser-n");
const fs = require("fs");

// Parse an HTML string into a node tree and persist it as JSON.
fs.writeFileSync('./demo.json', JSON.stringify(
  htmlStrParser(`
<html>
<body>
<span id="root" style="color:red;">我是span標簽</span>
</body>
</html>
`)
));

// Load the saved tree back and serialize it to an HTML string again.
console.log(htmlObjParser(require("./demo.json")));
實現原理
- 狀態機記錄執行狀態
// Parser states. Each value equals its key so a logged status reads the same
// as the constant name.
const sign_enum = {
  SIGN_END: "SIGN_END", // reading a closing tag, e.g. </xxxxx>
  SIGN_END_OK: "SIGN_END_OK", // finished reading a closing tag
  SIGN_START: "SIGN_START", // reading an opening tag, e.g. <xxxxx>
  SIGN_START_OK: "SIGN_START_OK", // finished reading an opening tag
};
- 字符串輪詢讀取,根據特殊符號 <、</、> 來標注狀態
- 標記每次讀取的內容 sign
- 用淺拷貝來標記每次操作的節點
完整代碼
// Parser states used by htmlStrParser. Each value equals its key so a logged
// status reads the same as the constant name.
const sign_enum = {
  SIGN_END: "SIGN_END", // reading a closing tag, e.g. </xxxxx>
  SIGN_END_OK: "SIGN_END_OK", // finished reading a closing tag
  SIGN_START: "SIGN_START", // reading an opening tag, e.g. <xxxxx>
  SIGN_START_OK: "SIGN_START_OK", // finished reading an opening tag
};
// Elements that have no content and no closing tag; they are stored as leaves
// so that the siblings following them are not nested inside them.
const HTML_VOID_TAGS = new Set([
  "area", "base", "br", "col", "embed", "hr", "img", "input",
  "link", "meta", "source", "track", "wbr",
]);

// Keys that describe the node itself; an attribute with one of these names
// would overwrite the node structure, so such attributes are skipped.
const RESERVED_NODE_KEYS = new Set(["nodeName", "children", "text"]);

/**
 * Converts an inline style value such as "color: red; margin: 0 auto" into an object.
 *
 * @param {string} styleText - Raw value of a `style` attribute (without quotes).
 * @returns {Object<string, string>} Map of property name to value, both trimmed.
 */
function parseInlineStyle(styleText) {
  const style = {};
  styleText.split(";").forEach((declaration) => {
    // Split on the first colon only so values like url(http://...) stay intact.
    const colonAt = declaration.indexOf(":");
    if (colonAt === -1) return;
    const property = declaration.slice(0, colonAt).trim();
    if (property) style[property] = declaration.slice(colonAt + 1).trim();
  });
  return style;
}

/**
 * Builds a node from the text found between "<" and ">" of an opening tag.
 *
 * @param {string} rawTag - Tag content, e.g. `span id="root" style="color:red"`.
 * @returns {{node: Object, isLeaf: boolean}|null} The new node and whether it is
 *   self-closing/void, or null when no tag name is present.
 */
function parseOpeningTag(rawTag) {
  let content = rawTag.trim();
  const selfClosing = content.endsWith("/");
  if (selfClosing) content = content.slice(0, -1).trim();

  const nameMatch = content.match(/^[^\s\/]+/);
  if (!nameMatch) return null;
  const nodeName = nameMatch[0];

  // key, optionally followed by = and a double-quoted, single-quoted or bare value.
  const attributePattern = /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g;
  const attributeText = content.slice(nodeName.length);
  const attributes = {};
  let match = attributePattern.exec(attributeText);
  while (match !== null) {
    const key = match[1];
    if (!RESERVED_NODE_KEYS.has(key)) {
      // Attributes without a value (e.g. `disabled`) are stored as "".
      const value = [match[2], match[3], match[4]].find((part) => part !== undefined) || "";
      attributes[key] = key === "style" ? parseInlineStyle(value) : value;
    }
    match = attributePattern.exec(attributeText);
  }

  return {
    node: { nodeName, children: [], ...attributes },
    isLeaf: selfClosing || HTML_VOID_TAGS.has(nodeName.toLowerCase()),
  };
}

/**
 * Closes the innermost open element whose name matches a closing tag.
 * Elements opened after it are closed implicitly; a closing tag that matches
 * nothing is ignored, and the synthetic root (index 0) is never removed.
 *
 * @param {Object[]} openNodes - Chain of currently open nodes, root first (mutated).
 * @param {string} rawTag - Closing tag content including the leading "/", e.g. "/div".
 */
function closeOpenElement(openNodes, rawTag) {
  const closingName = rawTag.replace(/^\s*\/\s*/, "").trim().toLowerCase();
  for (let depth = openNodes.length - 1; depth >= 1; depth--) {
    if (String(openNodes[depth].nodeName).toLowerCase() === closingName) {
      openNodes.length = depth;
      return;
    }
  }
}

/**
 * Parses an HTML string into a virtual-DOM-like tree.
 *
 * Every node has the shape `{ nodeName, children, ...attributes, text? }`:
 * attributes are copied onto the node, `style` becomes an object of
 * declarations, and `text` holds the text that directly follows the opening
 * tag. The tree is wrapped in a synthetic `{ nodeName: "root" }` node.
 *
 * Limitations: a node has a single `text` slot, so text that appears after a
 * child element (or outside any element) is discarded. The content of
 * <script>/<style> is not treated specially.
 *
 * @param {string} htmlStr - HTML source; line breaks are removed before parsing.
 *   null/undefined is treated as an empty string.
 * @returns {{nodeName: string, children: Object[]}} The synthetic root node.
 */
function htmlStrParser(htmlStr) {
  const str = (htmlStr == null ? "" : String(htmlStr)).replace(/\r?\n/g, "");
  const result = { nodeName: "root", children: [] };
  const openNodes = [result]; // chain of currently open nodes; last = innermost
  let sign = ""; // characters collected for the current tag or text run
  let status = ""; // one of sign_enum, or "" before the first tag
  let quote = ""; // quote character of the attribute value being read, if any

  for (let i = 0; i < str.length; i++) {
    const current = str.charAt(i);

    // Inside a quoted attribute value "<" and ">" are ordinary characters.
    if (quote) {
      if (current === quote) quote = "";
      sign += current;
      continue;
    }

    const insideTag = status === sign_enum.SIGN_START || status === sign_enum.SIGN_END;

    // Skip comments as a whole so markup inside them is not parsed.
    if (!insideTag && str.startsWith("<!--", i)) {
      const commentEnd = str.indexOf("-->", i + 4);
      if (commentEnd === -1) break;
      i = commentEnd + 2; // the loop increment steps past the final ">"
      continue;
    }

    if (current === "<") {
      // Text right after an opening tag belongs to the innermost open node.
      if (sign && status === sign_enum.SIGN_START_OK) {
        openNodes[openNodes.length - 1].text = sign;
      }
      // Always reset: any other pending text must not leak into the tag name.
      sign = "";
      status = str.charAt(i + 1) === "/" ? sign_enum.SIGN_END : sign_enum.SIGN_START;
    } else if (current === ">") {
      if (status === sign_enum.SIGN_START) {
        // <!DOCTYPE ...> and <?xml ...?> are declarations, not elements.
        const isDeclaration = /^\s*[!?]/.test(sign);
        const tag = isDeclaration ? null : parseOpeningTag(sign);
        sign = "";
        if (tag === null) {
          status = sign_enum.SIGN_END_OK;
        } else {
          openNodes[openNodes.length - 1].children.push(tag.node);
          if (tag.isLeaf) {
            // Opened and closed at once; following text is not its content.
            status = sign_enum.SIGN_END_OK;
          } else {
            openNodes.push(tag.node);
            status = sign_enum.SIGN_START_OK;
          }
        }
      } else if (status === sign_enum.SIGN_END) {
        closeOpenElement(openNodes, sign);
        sign = "";
        status = sign_enum.SIGN_END_OK;
      } else {
        // A ">" outside of any tag is plain text.
        sign += current;
      }
    } else {
      if (status === sign_enum.SIGN_START && (current === '"' || current === "'")) {
        quote = current;
      }
      sign += current;
    }
  }

  // Keep the text of a final element that was never closed.
  if (sign && status === sign_enum.SIGN_START_OK) {
    openNodes[openNodes.length - 1].text = sign;
  }
  return result;
}
// Parse the sample document once, print the tree, then store it as JSON text.
{
  const parsedTree = htmlStrParser(htmlStr);
  console.dir(parsedTree);
  fs.writeFileSync("htmlObj.text", JSON.stringify(parsedTree));
}
格式化查看
{
"nodeName":"root",
"children":[
{
"nodeName":"html",
"children":[
{
"nodeName":"head",
"children":[]
},
{
"nodeName":"body",
"children":[
{
"nodeName":"h1",
"children":[],
"text":"我是標簽"
},
{
"nodeName":"div",
"children":[],
"text":"我是div標簽"
},
{
"nodeName":"span",
"children":[],
"id":"root",
"style":{
"color":"red"
},
"text":"我是span標簽"
}
],
"text":" "
}
]
}
]
}
用js解析html對象
實現 html 的增刪查改時,可以先轉成對象數組的形式,然後操作對象數組,操作完成後再轉成字符串
/**
 * Serializes a node tree (as produced by htmlStrParser) back into an HTML string.
 *
 * Every key other than `nodeName`, `text` and `children` is written as an
 * attribute with a double-quoted value. A `style` object is written as one
 * `style="prop:value;..."` attribute. Attributes whose value is null or
 * undefined are written as bare names. Text is emitted right after the
 * opening tag, followed by the children. Void elements (e.g. <br>) without
 * text or children get no closing tag. A node named "root" is serialized like
 * any other node.
 *
 * @param {Object} obj - Node with `nodeName` and optional `children`, `text`
 *   and attribute keys. null/undefined yields an empty string.
 * @returns {string} The HTML markup for the node and its descendants.
 */
function htmlObjParser(obj) {
  if (obj == null) return "";

  const structuralKeys = new Set(["nodeName", "text", "children"]);
  const voidTags = new Set([
    "area", "base", "br", "col", "embed", "hr", "img", "input",
    "link", "meta", "source", "track", "wbr",
  ]);

  // Only the quote character is escaped; other characters are passed through
  // unchanged so values that already contain entities are not double-escaped.
  const quoteAttribute = (value) => `"${String(value).replace(/"/g, "&quot;")}"`;

  // Turns { color: "red" } into "color:red;"; non-object values are used as-is.
  function serializeStyle(style) {
    if (typeof style !== "object") return String(style);
    return Object.keys(style)
      .map((property) => `${property}:${style[property]};`)
      .join("");
  }

  function serializeAttributes(node) {
    let attrStr = "";
    Object.keys(node).forEach((key) => {
      if (structuralKeys.has(key)) return;
      const value = node[key];
      if (value == null) {
        attrStr += ` ${key}`;
        return;
      }
      if (key === "style") {
        const styleStr = serializeStyle(value);
        // An empty style object produces no attribute at all.
        if (styleStr) attrStr += ` style=${quoteAttribute(styleStr)}`;
        return;
      }
      attrStr += ` ${key}=${quoteAttribute(value)}`;
    });
    return attrStr;
  }

  function work(node) {
    const children = Array.isArray(node.children) ? node.children : [];
    const text = node.text ? node.text : "";
    const openTag = `<${node.nodeName}${serializeAttributes(node)}>`;
    const isVoid = voidTags.has(String(node.nodeName).toLowerCase());
    if (isVoid && !text && children.length === 0) return openTag;
    const inner = children.map((child) => work(child)).join("");
    return `${openTag}${text}${inner}</${node.nodeName}>`;
  }

  return work(obj);
}
// Read back the JSON tree stored in "htmlObj.text" and serialize it.
// require() cannot be used here: a specifier without "./" is resolved as a
// package, and a ".text" file would be evaluated as JavaScript, not JSON.
// NOTE(review): assumes the intended input is the "htmlObj.text" file written
// by the demo above — confirm.
htmlObjParser(JSON.parse(fs.readFileSync("htmlObj.text", "utf8")));