1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

make cleaning more obvious

This commit is contained in:
Jan Lukas Gernert 2023-03-19 23:09:06 +01:00
parent 11e08ae505
commit 280c516cbe
4 changed files with 27 additions and 23 deletions

View file

@@ -22,9 +22,9 @@
<td><span size="-1">( <a href="http://www.origami-club.com/index.html" target="_blank">おりがみくらぶ</a> より)</span></td> <td><span size="-1">( <a href="http://www.origami-club.com/index.html" target="_blank">おりがみくらぶ</a> より)</span></td>
<td><img src="http://fakehost/366/logo_bana/corner_2.gif" width="7" height="7"></td> <td><img src="http://fakehost/366/logo_bana/corner_2.gif" width="7" height="7"></td>
</tr> </tr>
<tr><td colspan="4"><P> <tr><td colspan="4"><table><tbody><tr><td>
<a href="http://www.origami-club.com/easy/dogfase/index.html" target="_blank"><span size="+2"><img src="http://fakehost/gazou/origami_gazou/kantan/dogface.gif" alt="犬の顔の折り紙" width="73" height="51">いぬのかお</span></a><a href="http://www.origami-club.com/easy/dog/index.html" target="_blank"><img src="http://fakehost/gazou/origami_gazou/kantan/dog.gif" alt="犬の顔の紙" width="62" height="43"><span size="+2">いぬ</span></a> <a href="http://www.origami-club.com/easy/dogfase/index.html" target="_blank"><span size="+2"><img src="http://fakehost/gazou/origami_gazou/kantan/dogface.gif" alt="犬の顔の折り紙" width="73" height="51">いぬのかお</span></a><a href="http://www.origami-club.com/easy/dog/index.html" target="_blank"><img src="http://fakehost/gazou/origami_gazou/kantan/dog.gif" alt="犬の顔の紙" width="62" height="43"><span size="+2">いぬ</span></a>
</P></td></tr> </td></tr></tbody></table></td></tr>
</tbody></table></DIV> </tbody></table></DIV>
<table><tbody> <table><tbody>
<tr><td> <tr><td>

View file

@@ -125,9 +125,7 @@ impl FullTextParser {
return Err(error); return Err(error);
} }
if let Some(mut root) = document.get_root_element() { Self::post_process_document(&document)?;
Self::post_process_content(&mut root, false)?;
}
article.document = Some(document); article.document = Some(document);
@@ -784,7 +782,7 @@ impl FullTextParser {
return Err(FullTextParserError::Xml); return Err(FullTextParserError::Xml);
} }
Self::post_process_content(&mut node, true)?; Self::post_process_page(&mut node)?;
node.unlink(); node.unlink();
if root.add_child(&mut node).is_ok() { if root.add_child(&mut node).is_ok() {
@@ -857,20 +855,29 @@ impl FullTextParser {
Ok(()) Ok(())
} }
pub(crate) fn post_process_content( pub(crate) fn post_process_document(
node: &mut Node, document: &Document
clean_conditionally: bool,
) -> Result<(), FullTextParserError> { ) -> Result<(), FullTextParserError> {
if clean_conditionally { if let Some(mut root) = document.get_root_element() {
Util::clean_conditionally(node, "fieldset"); Self::simplify_nested_elements(&mut root)?;
Util::clean_conditionally(node, "table");
Util::clean_conditionally(node, "ul"); Self::clean_attributes(&mut root)?;
Util::clean_conditionally(node, "div"); Self::remove_single_cell_tables(&mut root);
Self::remove_extra_p_and_div(&mut root);
} }
Self::clean_attributes(node)?; Ok(())
Self::simplify_nested_elements(node)?; }
pub(crate) fn post_process_page(
node: &mut Node,
) -> Result<(), FullTextParserError> {
Util::clean_conditionally(node, "fieldset");
Util::clean_conditionally(node, "table");
Util::clean_conditionally(node, "ul");
Util::clean_conditionally(node, "div");
Self::clean_attributes(node)?;
Self::remove_single_cell_tables(node); Self::remove_single_cell_tables(node);
Self::remove_extra_p_and_div(node); Self::remove_extra_p_and_div(node);

View file

@@ -497,10 +497,9 @@ impl Readability {
} }
} }
crate::FullTextParser::post_process_content( if state.clean_conditionally {
&mut article_content, crate::FullTextParser::post_process_page(&mut article_content)?;
state.clean_conditionally, }
)?;
if needed_to_create_top_candidate { if needed_to_create_top_candidate {
// We already created a fake div thing, and there wouldn't have been any siblings left // We already created a fake div thing, and there wouldn't have been any siblings left

View file

@@ -34,9 +34,7 @@ async fn run_test(name: &str) {
metadata::extract(&xpath_ctx, None, None, &mut article); metadata::extract(&xpath_ctx, None, None, &mut article);
super::Readability::extract_body(document, &mut root, article.title.as_deref()).unwrap(); super::Readability::extract_body(document, &mut root, article.title.as_deref()).unwrap();
if let Some(mut root) = article_document.get_root_element() { crate::FullTextParser::post_process_document(&article_document).unwrap();
crate::FullTextParser::post_process_content(&mut root, false).unwrap();
}
article.document = Some(article_document); article.document = Some(article_document);
let html = article.get_content().unwrap(); let html = article.get_content().unwrap();