mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
make cleaning more obvious
This commit is contained in:
parent
11e08ae505
commit
280c516cbe
4 changed files with 27 additions and 23 deletions
|
@@ -22,9 +22,9 @@
|
|||
<td><span size="-1">( <a href="http://www.origami-club.com/index.html" target="_blank">おりがみくらぶ</a> より)</span></td>
|
||||
<td><img src="http://fakehost/366/logo_bana/corner_2.gif" width="7" height="7"></td>
|
||||
</tr>
|
||||
<tr><td colspan="4"><P>
|
||||
<tr><td colspan="4"><table><tbody><tr><td>
|
||||
<a href="http://www.origami-club.com/easy/dogfase/index.html" target="_blank"><span size="+2"><img src="http://fakehost/gazou/origami_gazou/kantan/dogface.gif" alt="犬の顔の折り紙" width="73" height="51">いぬのかお</span></a><a href="http://www.origami-club.com/easy/dog/index.html" target="_blank"><img src="http://fakehost/gazou/origami_gazou/kantan/dog.gif" alt="犬の顔の紙" width="62" height="43"><span size="+2">いぬ</span></a>
|
||||
</P></td></tr>
|
||||
</td></tr></tbody></table></td></tr>
|
||||
</tbody></table></DIV>
|
||||
<table><tbody>
|
||||
<tr><td>
|
||||
|
|
|
@@ -125,9 +125,7 @@ impl FullTextParser {
|
|||
return Err(error);
|
||||
}
|
||||
|
||||
if let Some(mut root) = document.get_root_element() {
|
||||
Self::post_process_content(&mut root, false)?;
|
||||
}
|
||||
Self::post_process_document(&document)?;
|
||||
|
||||
article.document = Some(document);
|
||||
|
||||
|
@@ -784,7 +782,7 @@ impl FullTextParser {
|
|||
return Err(FullTextParserError::Xml);
|
||||
}
|
||||
|
||||
Self::post_process_content(&mut node, true)?;
|
||||
Self::post_process_page(&mut node)?;
|
||||
|
||||
node.unlink();
|
||||
if root.add_child(&mut node).is_ok() {
|
||||
|
@@ -857,20 +855,29 @@ impl FullTextParser {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn post_process_content(
|
||||
node: &mut Node,
|
||||
clean_conditionally: bool,
|
||||
pub(crate) fn post_process_document(
|
||||
document: &Document
|
||||
) -> Result<(), FullTextParserError> {
|
||||
if clean_conditionally {
|
||||
Util::clean_conditionally(node, "fieldset");
|
||||
Util::clean_conditionally(node, "table");
|
||||
Util::clean_conditionally(node, "ul");
|
||||
Util::clean_conditionally(node, "div");
|
||||
if let Some(mut root) = document.get_root_element() {
|
||||
Self::simplify_nested_elements(&mut root)?;
|
||||
|
||||
Self::clean_attributes(&mut root)?;
|
||||
Self::remove_single_cell_tables(&mut root);
|
||||
Self::remove_extra_p_and_div(&mut root);
|
||||
}
|
||||
|
||||
Self::clean_attributes(node)?;
|
||||
Self::simplify_nested_elements(node)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn post_process_page(
|
||||
node: &mut Node,
|
||||
) -> Result<(), FullTextParserError> {
|
||||
Util::clean_conditionally(node, "fieldset");
|
||||
Util::clean_conditionally(node, "table");
|
||||
Util::clean_conditionally(node, "ul");
|
||||
Util::clean_conditionally(node, "div");
|
||||
|
||||
Self::clean_attributes(node)?;
|
||||
Self::remove_single_cell_tables(node);
|
||||
Self::remove_extra_p_and_div(node);
|
||||
|
||||
|
|
|
@@ -497,10 +497,9 @@ impl Readability {
|
|||
}
|
||||
}
|
||||
|
||||
crate::FullTextParser::post_process_content(
|
||||
&mut article_content,
|
||||
state.clean_conditionally,
|
||||
)?;
|
||||
if state.clean_conditionally {
|
||||
crate::FullTextParser::post_process_page(&mut article_content)?;
|
||||
}
|
||||
|
||||
if needed_to_create_top_candidate {
|
||||
// We already created a fake div thing, and there wouldn't have been any siblings left
|
||||
|
|
|
@@ -34,9 +34,7 @@ async fn run_test(name: &str) {
|
|||
|
||||
metadata::extract(&xpath_ctx, None, None, &mut article);
|
||||
super::Readability::extract_body(document, &mut root, article.title.as_deref()).unwrap();
|
||||
if let Some(mut root) = article_document.get_root_element() {
|
||||
crate::FullTextParser::post_process_content(&mut root, false).unwrap();
|
||||
}
|
||||
crate::FullTextParser::post_process_document(&article_document).unwrap();
|
||||
|
||||
article.document = Some(article_document);
|
||||
let html = article.get_content().unwrap();
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue