這篇文章給大家分享的是有關(guān)基于xpath選擇器、PyQuery、正則表達(dá)式格式清理工具的示例分析的內(nèi)容。小編覺(jué)得挺實(shí)用的,因此分享給大家做個(gè)參考,一起跟隨小編過(guò)來(lái)看看吧。

1,使用xpath清理不必要的標(biāo)簽元素,以及無(wú)內(nèi)容標(biāo)簽
import re

from loguru import logger
from lxml import etree


def xpath_clean(self, text: str, xpath_dict: dict) -> str:
    '''
    Remove unwanted elements from an HTML document via XPath.

    :param text: html_content
    :param xpath_dict: caller-supplied XPath expressions of nodes to drop
    :return: string type html_content
    '''
    # Copy so the caller's dict is not mutated by the update() below.
    remove_by_xpath = dict(xpath_dict) if xpath_dict else dict()
    # Tags removed in all but the most unusual cases.
    remove_by_xpath.update({
        '_remove_2': '//iframe',
        '_remove_4': '//button',
        '_remove_5': '//form',
        '_remove_6': '//input',
        '_remove_7': '//select',
        '_remove_8': '//option',
        '_remove_9': '//textarea',
        '_remove_10': '//figure',
        '_remove_11': '//figcaption',
        '_remove_12': '//frame',
        '_remove_13': '//video',
        '_remove_14': '//script',
        '_remove_15': '//style'
    })
    parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
    selector = etree.HTML(text, parser=parser)
    # Drop every node matched by the configured expressions.
    for xpath in remove_by_xpath.values():
        for bad in selector.xpath(xpath):
            bad_string = etree.tostring(bad, encoding='utf-8',
                                        pretty_print=True).decode()
            logger.debug(f"clean article content : {bad_string}")
            bad.getparent().remove(bad)
    skip_tip = "name()='img' or name()='tr' or " \
               "name()='th' or name()='tbody' or " \
               "name()='thead' or name()='table'"
    # Remove any remaining element that carries no text content at all,
    # keeping table/image structure untouched.
    for p in selector.xpath(f"//*[not({skip_tip})]"):
        # Keep nodes that contain table/img descendants or any visible text.
        if p.xpath(f".//*[{skip_tip}]") or \
                bool(re.sub(r'\s', '', p.xpath('string(.)'))):
            continue
        bad_p = etree.tostring(p, encoding='utf-8',
                               pretty_print=True).decode()
        logger.debug(f"clean p tag : {bad_p}")
        parent = p.getparent()
        # The document root has no parent; skip it instead of crashing.
        if parent is not None:
            parent.remove(p)
    return etree.tostring(selector, encoding='utf-8',
                          pretty_print=True).decode()

# 2. Use PyQuery to clean tag attributes and return the processed html plus plain text
#!/usr/bin/env python
# -*-coding:utf-8-*-
from loguru import logger
from pyquery import PyQuery as pq


def pyquery_clean(self, text, url, pq_dict) -> object:
    '''
    PyQuery pass: remove configured selectors and strip tag attributes.

    :param text: html_content
    :param url: base url used to absolutize image links
    :param pq_dict: selectors of nodes to remove
    :return: (plain_text, html_content)
    '''
    # Selectors of nodes to delete.
    remove_by_pq = pq_dict if pq_dict else dict()
    # Attributes that survive the strip (table layout info).
    attr_white_list = ['rowspan', 'colspan']
    # Attributes that may hold the real image url (lazy-load variants).
    img_key_list = ['src', 'data-echo', 'data-src', 'data-original']
    dom = pq(text)
    # Remove configured selectors, logging each matched node first.
    for bad_tag in remove_by_pq.values():
        for bad in dom(bad_tag):
            bad_string = pq(bad).html()
            logger.debug(f"clean article content : {bad_string}")
        # One removal per selector drops every match at once; calling it
        # inside the per-match loop would re-remove on stale references.
        dom.remove(bad_tag)
    # Normalize attributes on every element.
    for tag in dom('*'):
        # Snapshot the items: the loop body mutates tag.attrib.
        for key, value in list(tag.attrib.items()):
            # Keep whitelisted table attributes untouched.
            if key in attr_white_list:
                continue
            if key in img_key_list:
                # Rewrite lazy-load keys to a single absolute src.
                img_url = self.absolute_url(url, value)
                pq(tag).remove_attr(key)
                pq(tag).attr('src', img_url)
                pq(tag).attr('alt', '')
            elif key == 'alt':
                # Keep the alt attribute but blank its value.
                pq(tag).attr(key, '')
            else:
                # Everything else is dropped.
                pq(tag).remove_attr(key)
    return dom.text(), dom.html()

# 3. Use regular expressions to clean whitespace and line-break content
#!/usr/bin/env python
# -*-coding:utf-8-*-
import re


def regular_clean(self, str1: str, str2: str):
    '''
    Regex formatting pass over the extracted content.

    :param str1: content (plain text)
    :param str2: html_content
    :return: processed (content, html_content)
    '''
    def new_line(text):
        # Normalize <br> variants, drop inline formatting tags, then
        # convert headings to paragraphs and restore line breaks.
        # Raw strings so the regex escapes are not mangled by Python.
        text = re.sub(r'<br\s?/?>', '<br>', text)
        text = re.sub(
            r'</?a>|</?em>|</?html>|</?body>|'
            r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
            r'</?strong>|</?blockquote>|</?b>|'
            r'</?span>|</?i>|</?hr>|</?font>',
            '',
            text)
        text = re.sub(r'\n', '', text)
        text = re.sub(r'<h[1-6]>', '<p>', text)
        text = re.sub(r'</h[1-6]>', '</p>', text)
        text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
        return text

    # Collapse blank runs in both variants first.
    str1, str2 = self.clean_blank(str1), self.clean_blank(str2)
    # Then fix tags and line breaks in the html variant only.
    str2 = new_line(text=str2)
    return str1, str2

# Closing section: the wrapper class that combines all of the methods
#!/usr/bin/env python
# -*-coding:utf-8-*-
'''
author: szhan
date:2020-08-17
summery: 清理html_conent以及獲取純凈數(shù)據(jù)格式
'''
import re
from lxml import etree
from pyquery import PyQuery as pq
from urllib.parse import urlsplit, urljoin
from loguru import logger
class CleanArticle:
    '''
    Clean html_content and produce both a purified html string and its
    plain-text content.

    Pipeline (see run()): xpath_clean -> pyquery_clean -> regular_clean.
    '''

    def __init__(
            self,
            text: str,
            url: str = '',
            xpath_dict: dict = None,
            pq_dict: dict = None
    ):
        # text: raw html content to clean
        # url: base url used to absolutize relative image links
        # xpath_dict / pq_dict: extra removal rules supplied by the caller
        self.text = text
        self.url = url
        self.xpath_dict = xpath_dict or dict()
        self.pq_dict = pq_dict or dict()

    @staticmethod
    def absolute_url(baseurl: str, url: str) -> str:
        '''
        Complete a possibly-relative url.

        :param baseurl: scheme url
        :param url: target url
        :return: complete url
        '''
        return url if urlsplit(url).scheme else urljoin(baseurl, url)

    @staticmethod
    def clean_blank(text):
        '''
        Whitespace cleanup: drop spaces, ideographic spaces, tabs and
        non-breaking spaces, then squeeze runs of whitespace and blank lines.

        :param text: string to clean
        :return: cleaned string
        '''
        text = text.replace(' ', '').replace('\u3000', '').replace('\t', '').replace('\xa0', '')
        text = re.sub(r'\s{2,}', '', text)
        text = re.sub(r'\n{2,}', '\n', text)
        return text.strip('\n').strip()

    def run(self):
        '''
        Run the full cleanup pipeline.

        :return: processed (content, html_content)
        :raises ValueError: when self.text is empty or not a string
        '''
        if (not bool(self.text)) or (not isinstance(self.text, str)):
            raise ValueError('html_content has a bad type value')
        # Step 1: xpath pass removes comments and iframe/button/form/script/style/video... tags.
        text = self.xpath_clean(self.text, self.xpath_dict)
        # Step 2: pyquery pass normalizes attributes and image links.
        str1, str2 = self.pyquery_clean(text, self.url, self.pq_dict)
        # Step 3: final regex formatting.
        content, html_content = self.regular_clean(str1, str2)
        return content, html_content

    def xpath_clean(self, text: str, xpath_dict: dict) -> str:
        '''
        Remove unwanted elements via XPath.

        :param text: html_content
        :param xpath_dict: caller-supplied XPath expressions of nodes to drop
        :return: string type html_content
        '''
        # Copy so the caller's dict (e.g. self.xpath_dict) is not mutated
        # by the update() below.
        remove_by_xpath = dict(xpath_dict) if xpath_dict else dict()
        # Tags removed in all but the most unusual cases.
        remove_by_xpath.update({
            '_remove_2': '//iframe',
            '_remove_4': '//button',
            '_remove_5': '//form',
            '_remove_6': '//input',
            '_remove_7': '//select',
            '_remove_8': '//option',
            '_remove_9': '//textarea',
            '_remove_10': '//figure',
            '_remove_11': '//figcaption',
            '_remove_12': '//frame',
            '_remove_13': '//video',
            '_remove_14': '//script',
            '_remove_15': '//style'
        })
        parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
        selector = etree.HTML(text, parser=parser)
        # Drop every node matched by the configured expressions.
        for xpath in remove_by_xpath.values():
            for bad in selector.xpath(xpath):
                bad_string = etree.tostring(bad, encoding='utf-8',
                                            pretty_print=True).decode()
                logger.debug(f"clean article content : {bad_string}")
                bad.getparent().remove(bad)
        skip_tip = "name()='img' or name()='tr' or " \
                   "name()='th' or name()='tbody' or " \
                   "name()='thead' or name()='table'"
        # Remove any remaining element that carries no text content at all,
        # keeping table/image structure untouched.
        for p in selector.xpath(f"//*[not({skip_tip})]"):
            # Keep nodes that contain table/img descendants or visible text.
            if p.xpath(f".//*[{skip_tip}]") or \
                    bool(re.sub(r'\s', '', p.xpath('string(.)'))):
                continue
            bad_p = etree.tostring(p, encoding='utf-8',
                                   pretty_print=True).decode()
            logger.debug(f"clean p tag : {bad_p}")
            parent = p.getparent()
            # The document root has no parent; skip it instead of crashing.
            if parent is not None:
                parent.remove(p)
        return etree.tostring(selector, encoding='utf-8',
                              pretty_print=True).decode()

    def pyquery_clean(self, text, url, pq_dict) -> object:
        '''
        PyQuery pass: remove configured selectors and strip tag attributes.

        :param text: html_content
        :param url: base url used to absolutize image links
        :param pq_dict: selectors of nodes to remove
        :return: (plain_text, html_content)
        '''
        # Selectors of nodes to delete.
        remove_by_pq = pq_dict if pq_dict else dict()
        # Attributes that survive the strip (table layout info).
        attr_white_list = ['rowspan', 'colspan']
        # Attributes that may hold the real image url (lazy-load variants).
        img_key_list = ['src', 'data-echo', 'data-src', 'data-original']
        dom = pq(text)
        # Remove configured selectors, logging each matched node first.
        for bad_tag in remove_by_pq.values():
            for bad in dom(bad_tag):
                bad_string = pq(bad).html()
                logger.debug(f"clean article content : {bad_string}")
            # One removal per selector drops every match at once; calling it
            # inside the per-match loop would re-remove on stale references.
            dom.remove(bad_tag)
        # Normalize attributes on every element.
        for tag in dom('*'):
            # Snapshot the items: the loop body mutates tag.attrib.
            for key, value in list(tag.attrib.items()):
                # Keep whitelisted table attributes untouched.
                if key in attr_white_list:
                    continue
                if key in img_key_list:
                    # Rewrite lazy-load keys to a single absolute src.
                    img_url = self.absolute_url(url, value)
                    pq(tag).remove_attr(key)
                    pq(tag).attr('src', img_url)
                    pq(tag).attr('alt', '')
                elif key == 'alt':
                    # Keep the alt attribute but blank its value.
                    pq(tag).attr(key, '')
                else:
                    # Everything else is dropped.
                    pq(tag).remove_attr(key)
        return dom.text(), dom.html()

    def regular_clean(self, str1: str, str2: str):
        '''
        Regex formatting pass over the extracted content.

        :param str1: content (plain text)
        :param str2: html_content
        :return: processed (content, html_content)
        '''
        def new_line(text):
            # Normalize <br> variants, drop inline formatting tags, then
            # convert headings to paragraphs and restore line breaks.
            # Raw strings so the regex escapes are not mangled by Python.
            text = re.sub(r'<br\s?/?>', '<br>', text)
            text = re.sub(
                r'</?a>|</?em>|</?html>|</?body>|'
                r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
                r'</?strong>|</?blockquote>|</?b>|'
                r'</?span>|</?i>|</?hr>|</?font>',
                '',
                text)
            text = re.sub(r'\n', '', text)
            text = re.sub(r'<h[1-6]>', '<p>', text)
            text = re.sub(r'</h[1-6]>', '</p>', text)
            text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
            return text

        # Collapse blank runs in both variants first.
        str1, str2 = self.clean_blank(str1), self.clean_blank(str2)
        # Then fix tags and line breaks in the html variant only.
        str2 = new_line(text=str2)
        return str1, str2
if __name__ == '__main__':
    # Demo entry point: clean a sample page from disk.
    # Read the whole file at once instead of concatenating line by line.
    with open('html_content.html', 'r', encoding='utf-8') as f:
        html = f.read()
    ca = CleanArticle(text=html)
    _, html_content = ca.run()
print(html_content)

感謝各位的閱讀!關(guān)于“基于xpath選擇器、PyQuery、正則表達(dá)式格式清理工具的示例分析”這篇文章就分享到這里了,希望以上內(nèi)容可以對(duì)大家有一定的幫助,讓大家可以學(xué)到更多知識(shí),如果覺(jué)得文章不錯(cuò),可以把它分享出去讓更多的人看到吧!
分享名稱(chēng):基于xpath選擇器、PyQuery、正則表達(dá)式格式清理工具的示例分析-創(chuàng)新互聯(lián)
轉(zhuǎn)載源于:http://chinadenli.net/article38/eoesp.html
成都網(wǎng)站建設(shè)公司_創(chuàng)新互聯(lián),為您提供定制開(kāi)發(fā)、網(wǎng)站內(nèi)鏈、移動(dòng)網(wǎng)站建設(shè)、網(wǎng)站設(shè)計(jì)、用戶(hù)體驗(yàn)、網(wǎng)站營(yíng)銷(xiāo)
聲明:本網(wǎng)站發(fā)布的內(nèi)容(圖片、視頻和文字)以用戶(hù)投稿、用戶(hù)轉(zhuǎn)載內(nèi)容為主,如果涉及侵權(quán)請(qǐng)盡快告知,我們將會(huì)在第一時(shí)間刪除。文章觀點(diǎn)不代表本網(wǎng)站立場(chǎng),如需處理請(qǐng)聯(lián)系客服。電話(huà):028-86922220;郵箱:631063699@qq.com。內(nèi)容未經(jīng)允許不得轉(zhuǎn)載,或轉(zhuǎn)載時(shí)需注明來(lái)源: 創(chuàng)新互聯(lián)
猜你還喜歡下面的內(nèi)容