package readability
import (
"bytes"
"fmt"
ghtml "html"
"io"
"math"
"net/http"
nurl "net/url"
"regexp"
"strconv"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
wl "github.com/abadojack/whatlanggo"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
var (
dataTableAttr = "XXX-DATA-TABLE"
rxSpaces = regexp.MustCompile(`(?is)\s{2,}|\n+`)
rxReplaceBrs = regexp.MustCompile(`(?is)(
]*>[ \n\r\t]*){2,}`)
rxByline = regexp.MustCompile(`(?is)byline|author|dateline|writtenby|p-author`)
rxUnlikelyCandidates = regexp.MustCompile(`(?is)banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
rxOkMaybeItsACandidate = regexp.MustCompile(`(?is)and|article|body|column|main|shadow`)
rxUnlikelyElements = regexp.MustCompile(`(?is)(input|time|button)`)
rxDivToPElements = regexp.MustCompile(`(?is)<(a|blockquote|dl|div|img|ol|p|pre|table|ul|select)`)
rxPositive = regexp.MustCompile(`(?is)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
rxNegative = regexp.MustCompile(`(?is)hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget`)
rxPIsSentence = regexp.MustCompile(`(?is)\.( |$)`)
rxVideos = regexp.MustCompile(`(?is)//(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com`)
rxKillBreaks = regexp.MustCompile(`(?is)(
(\s| ?)*)+`)
rxComments = regexp.MustCompile(`(?is)`)
)
type candidateItem struct {
score float64
node *goquery.Selection
}
type readability struct {
html string
url *nurl.URL
candidates map[string]candidateItem
}
// Metadata is metadata of an article
type Metadata struct {
Title string
Image string
Excerpt string
Author string
MinReadTime int
MaxReadTime int
}
// Article is the content of an URL
type Article struct {
URL string
Meta Metadata
Content string
RawContent string
}
// removeScripts removes script tags from the document.
func removeScripts(doc *goquery.Document) {
doc.Find("script").Remove()
doc.Find("noscript").Remove()
}
// replaceBrs replaces 2 or more successive
elements with a single
.
// Whitespace between
elements are ignored. For example:
//
abc
") body.SetHtml(html) // Remove empty p body.Find("p").Each(func(_ int, p *goquery.Selection) { html, _ := p.Html() html = strings.TrimSpace(html) if html == "" { p.Remove() } }) } // prepDocument prepares the HTML document for readability to scrape it. // This includes things like stripping JS, CSS, and handling terrible markup. func prepDocument(doc *goquery.Document) { // Remove all style tags in head doc.Find("style").Remove() // Replace all br replaceBrs(doc) // Replace font tags to span doc.Find("font").Each(func(_ int, font *goquery.Selection) { html, _ := font.Html() font.ReplaceWithHtml("" + html + "") }) } // getArticleTitle fetchs the article title func getArticleTitle(doc *goquery.Document) string { // Get title tag title := doc.Find("title").First().Text() title = normalizeText(title) originalTitle := title // Create list of separator separators := []string{`|`, `-`, `\`, `/`, `>`, `»`} hierarchialSeparators := []string{`\`, `/`, `>`, `»`} // If there's a separator in the title, first remove the final part titleHadHierarchicalSeparators := false if idx, sep := findSeparator(title, separators...); idx != -1 { titleHadHierarchicalSeparators = hasSeparator(title, hierarchialSeparators...) index := strings.LastIndex(originalTitle, sep) title = originalTitle[:index] // If the resulting title is too short (3 words or fewer), remove // the first part instead: if len(strings.Fields(title)) < 3 { index = strings.Index(originalTitle, sep) title = originalTitle[index+1:] } } else if strings.Contains(title, ": ") { // Check if we have an heading containing this exact string, so we // could assume it's the full title. existInHeading := false doc.Find("h1,h2").EachWithBreak(func(_ int, heading *goquery.Selection) bool { headingText := strings.TrimSpace(heading.Text()) if headingText == title { existInHeading = true return false } return true }) // If we don't, let's extract the title out of the original title string. if !existInHeading { index := strings.LastIndex(originalTitle, ":") title = originalTitle[index+1:] // If the title is now too short, try the first colon instead: if len(strings.Fields(title)) < 3 { index = strings.Index(originalTitle, ":") title = originalTitle[:index] // But if we have too many words before the colon there's something weird // with the titles and the H tags so let's just use the original title instead } else { index = strings.Index(originalTitle, ":") beforeColon := originalTitle[:index] if len(strings.Fields(beforeColon)) > 5 { title = originalTitle } } } } else if strLen(title) > 150 || strLen(title) < 15 { hOne := doc.Find("h1").First() if hOne != nil { title = hOne.Text() } } // If we now have 4 words or fewer as our title, and either no // 'hierarchical' separators (\, /, > or ») were found in the original // title or we decreased the number of words by more than 1 word, use // the original title. curTitleWordCount := len(strings.Fields(title)) noSeparatorWordCount := len(strings.Fields(removeSeparator(originalTitle, separators...))) if curTitleWordCount <= 4 && (!titleHadHierarchicalSeparators || curTitleWordCount != noSeparatorWordCount-1) { title = originalTitle } return normalizeText(title) } // getArticleMetadata attempts to get excerpt and byline metadata for the article. func getArticleMetadata(doc *goquery.Document) Metadata { metadata := Metadata{} mapAttribute := make(map[string]string) doc.Find("meta").Each(func(_ int, meta *goquery.Selection) { metaName, _ := meta.Attr("name") metaProperty, _ := meta.Attr("property") metaContent, _ := meta.Attr("content") metaName = strings.TrimSpace(metaName) metaProperty = strings.TrimSpace(metaProperty) metaContent = strings.TrimSpace(metaContent) // Fetch author name if strings.Contains(metaName+metaProperty, "author") { metadata.Author = metaContent return } // Fetch description and title if metaName == "title" || metaName == "description" || metaName == "twitter:title" || metaName == "twitter:image" || metaName == "twitter:description" { if _, exist := mapAttribute[metaName]; !exist { mapAttribute[metaName] = metaContent } return } if metaProperty == "og:description" || metaProperty == "og:image" || metaProperty == "og:title" { if _, exist := mapAttribute[metaProperty]; !exist { mapAttribute[metaProperty] = metaContent } return } }) // Set final image if _, exist := mapAttribute["og:image"]; exist { metadata.Image = mapAttribute["og:image"] } else if _, exist := mapAttribute["twitter:image"]; exist { metadata.Image = mapAttribute["twitter:image"] } if metadata.Image != "" && strings.HasPrefix(metadata.Image, "//") { metadata.Image = "http:" + metadata.Image } // Set final excerpt if _, exist := mapAttribute["description"]; exist { metadata.Excerpt = mapAttribute["description"] } else if _, exist := mapAttribute["og:description"]; exist { metadata.Excerpt = mapAttribute["og:description"] } else if _, exist := mapAttribute["twitter:description"]; exist { metadata.Excerpt = mapAttribute["twitter:description"] } // Set final title metadata.Title = getArticleTitle(doc) if metadata.Title == "" { if _, exist := mapAttribute["og:title"]; exist { metadata.Title = mapAttribute["og:title"] } else if _, exist := mapAttribute["twitter:title"]; exist { metadata.Title = mapAttribute["twitter:title"] } } // Clean up the metadata metadata.Title = normalizeText(metadata.Title) metadata.Excerpt = normalizeText(metadata.Excerpt) return metadata } // isValidByline checks whether the input string could be a byline. // This verifies that the input is a string, and that the length // is less than 100 chars. func isValidByline(str string) bool { return strLen(str) > 0 && strLen(str) < 100 } func isElementWithoutContent(s *goquery.Selection) bool { if s == nil { return true } html, _ := s.Html() html = strings.TrimSpace(html) return html == "" } // hasSinglePInsideElement checks if this node has only whitespace and a single P element. // Returns false if the DIV node contains non-empty text nodes // or if it contains no P or more than 1 element. func hasSinglePInsideElement(s *goquery.Selection) bool { // There should be exactly 1 element child which is a P return s.Children().Length() == 1 && s.Children().First().Is("p") } // hasChildBlockElement determines whether element has any children // block level elements. func hasChildBlockElement(s *goquery.Selection) bool { html, _ := s.Html() return rxDivToPElements.MatchString(html) } func setNodeTag(s *goquery.Selection, tag string) { html, _ := s.Html() newHTML := fmt.Sprintf("<%s>%s%s>", tag, html, tag) s.ReplaceWithHtml(newHTML) } func getNodeAncestors(node *goquery.Selection, maxDepth int) []*goquery.Selection { ancestors := []*goquery.Selection{} parent := node for i := 0; i < maxDepth; i++ { parent = parent.Parent() if len(parent.Nodes) == 0 { return ancestors } ancestors = append(ancestors, parent) } return ancestors } func hasAncestorTag(node *goquery.Selection, tag string, maxDepth int) (*goquery.Selection, bool) { parent := node if maxDepth < 0 { maxDepth = 100 } for i := 0; i < maxDepth; i++ { parent = parent.Parent() if len(parent.Nodes) == 0 { break } if parent.Is(tag) { return parent, true } } return nil, false } // initializeNodeScore initializes a node and checks the className/id // for special names to add to its score. func initializeNodeScore(node *goquery.Selection) candidateItem { contentScore := 0.0 tagName := goquery.NodeName(node) switch strings.ToLower(tagName) { case "article": contentScore += 10 case "section": contentScore += 8 case "div": contentScore += 5 case "pre", "blockquote", "td": contentScore += 3 case "form", "ol", "ul", "dl", "dd", "dt", "li", "address": contentScore -= 3 case "th", "h1", "h2", "h3", "h4", "h5", "h6": contentScore -= 5 } contentScore += getClassWeight(node) return candidateItem{contentScore, node} } // getClassWeight gets an elements class/id weight. // Uses regular expressions to tell if this element looks good or bad. func getClassWeight(node *goquery.Selection) float64 { weight := 0.0 if str, b := node.Attr("class"); b { if rxNegative.MatchString(str) { weight -= 25 } if rxPositive.MatchString(str) { weight += 25 } } if str, b := node.Attr("id"); b { if rxNegative.MatchString(str) { weight -= 25 } if rxPositive.MatchString(str) { weight += 25 } } return weight } // getLinkDensity gets the density of links as a percentage of the content // This is the amount of text that is inside a link divided by the total text in the node. func getLinkDensity(node *goquery.Selection) float64 { textLength := strLen(normalizeText(node.Text())) if textLength == 0 { return 0 } linkLength := 0 node.Find("a").Each(func(_ int, link *goquery.Selection) { linkLength += strLen(link.Text()) }) return float64(linkLength) / float64(textLength) } // Remove the style attribute on every e and under. func cleanStyle(s *goquery.Selection) { s.Find("*").Each(func(i int, s1 *goquery.Selection) { tagName := goquery.NodeName(s1) if strings.ToLower(tagName) == "svg" { return } s1.RemoveAttr("align") s1.RemoveAttr("background") s1.RemoveAttr("bgcolor") s1.RemoveAttr("border") s1.RemoveAttr("cellpadding") s1.RemoveAttr("cellspacing") s1.RemoveAttr("frame") s1.RemoveAttr("hspace") s1.RemoveAttr("rules") s1.RemoveAttr("style") s1.RemoveAttr("valign") s1.RemoveAttr("vspace") s1.RemoveAttr("onclick") s1.RemoveAttr("onmouseover") s1.RemoveAttr("border") s1.RemoveAttr("style") if tagName != "table" && tagName != "th" && tagName != "td" && tagName != "hr" && tagName != "pre" { s1.RemoveAttr("width") s1.RemoveAttr("height") } }) } // Return an object indicating how many rows and columns this table has. func getTableRowAndColumnCount(table *goquery.Selection) (int, int) { rows := 0 columns := 0 table.Find("tr").Each(func(_ int, tr *goquery.Selection) { // Look for rows strRowSpan, _ := tr.Attr("rowspan") rowSpan, err := strconv.Atoi(strRowSpan) if err != nil { rowSpan = 1 } rows += rowSpan // Now look for columns columnInThisRow := 0 tr.Find("td").Each(func(_ int, td *goquery.Selection) { strColSpan, _ := tr.Attr("colspan") colSpan, err := strconv.Atoi(strColSpan) if err != nil { colSpan = 1 } columnInThisRow += colSpan }) if columnInThisRow > columns { columns = columnInThisRow } }) return rows, columns } // Look for 'data' (as opposed to 'layout') tables func markDataTables(s *goquery.Selection) { s.Find("table").Each(func(_ int, table *goquery.Selection) { role, _ := table.Attr("role") if role == "presentation" { return } datatable, _ := table.Attr("datatable") if datatable == "0" { return } _, summaryExist := table.Attr("summary") if summaryExist { table.SetAttr(dataTableAttr, "1") return } caption := table.Find("caption").First() if len(caption.Nodes) > 0 && caption.Children().Length() > 0 { table.SetAttr(dataTableAttr, "1") return } // If the table has a descendant with any of these tags, consider a data table: dataTableDescendants := []string{"col", "colgroup", "tfoot", "thead", "th"} for _, tag := range dataTableDescendants { if table.Find(tag).Length() > 0 { table.SetAttr(dataTableAttr, "1") return } } // Nested tables indicate a layout table: if table.Find("table").Length() > 0 { return } nRow, nColumn := getTableRowAndColumnCount(table) if nRow >= 10 || nColumn > 4 { table.SetAttr(dataTableAttr, "1") return } // Now just go by size entirely: if nRow*nColumn > 10 { table.SetAttr(dataTableAttr, "1") return } }) } // Clean an element of all tags of type "tag" if they look fishy. // "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. func cleanConditionally(e *goquery.Selection, tag string) { isList := tag == "ul" || tag == "ol" e.Find(tag).Each(func(i int, node *goquery.Selection) { // First check if we're in a data table, in which case don't remove it if ancestor, hasTag := hasAncestorTag(node, "table", -1); hasTag { if attr, _ := ancestor.Attr(dataTableAttr); attr == "1" { return } } // If it is table, remove data table marker if tag == "table" { node.RemoveAttr(dataTableAttr) } contentScore := 0.0 weight := getClassWeight(node) if weight+contentScore < 0 { node.Remove() return } // If there are not very many commas, and the number of // non-paragraph elements is more than paragraphs or other // ominous signs, remove the element. nodeText := normalizeText(node.Text()) nCommas := strings.Count(nodeText, ",") nCommas += strings.Count(nodeText, ",") if nCommas < 10 { p := node.Find("p").Length() img := node.Find("img").Length() li := node.Find("li").Length() - 100 input := node.Find("input").Length() embedCount := 0 node.Find("embed").Each(func(i int, embed *goquery.Selection) { if !rxVideos.MatchString(embed.AttrOr("src", "")) { embedCount++ } }) contentLength := strLen(nodeText) linkDensity := getLinkDensity(node) _, hasFigureAncestor := hasAncestorTag(node, "figure", 3) haveToRemove := (!isList && li > p) || (img > 1 && float64(p)/float64(img) < 0.5 && !hasFigureAncestor) || (float64(input) > math.Floor(float64(p)/3)) || (!isList && contentLength < 25 && (img == 0 || img > 2) && !hasFigureAncestor) || (!isList && weight < 25 && linkDensity > 0.2) || (weight >= 25 && linkDensity > 0.5) || ((embedCount == 1 && contentLength < 75) || embedCount > 1) if haveToRemove { node.Remove() } } }) } // Clean a node of all elements of type "tag". // (Unless it's a youtube/vimeo video. People love movies.) func clean(s *goquery.Selection, tag string) { isEmbed := tag == "object" || tag == "embed" || tag == "iframe" s.Find(tag).Each(func(i int, target *goquery.Selection) { attributeValues := "" for _, attribute := range target.Nodes[0].Attr { attributeValues += " " + attribute.Val } if isEmbed && rxVideos.MatchString(attributeValues) { return } if isEmbed && rxVideos.MatchString(target.Text()) { return } target.Remove() }) } // Clean out spurious headers from an Element. Checks things like classnames and link density. func cleanHeaders(s *goquery.Selection) { s.Find("h1,h2,h3").Each(func(_ int, s1 *goquery.Selection) { if getClassWeight(s1) < 0 { s1.Remove() } }) } // Prepare the article node for display. Clean out any inline styles, // iframes, forms, strip extraneous
tags, etc. func prepArticle(articleContent *goquery.Selection, articleTitle string) { if articleContent == nil { return } // Check for data tables before we continue, to avoid removing items in // those tables, which will often be isolated even though they're // visually linked to other content-ful elements (text, images, etc.). markDataTables(articleContent) // Remove style attribute cleanStyle(articleContent) // Clean out junk from the article content cleanConditionally(articleContent, "form") cleanConditionally(articleContent, "fieldset") clean(articleContent, "h1") clean(articleContent, "object") clean(articleContent, "embed") clean(articleContent, "footer") clean(articleContent, "link") // Clean out elements have "share" in their id/class combinations from final top candidates, // which means we don't remove the top candidates even they have "share". articleContent.Find("*").Each(func(_ int, s *goquery.Selection) { id, _ := s.Attr("id") class, _ := s.Attr("class") matchString := class + " " + id if strings.Contains(matchString, "share") { s.Remove() } }) // If there is only one h2 and its text content substantially equals article title, // they are probably using it as a header and not a subheader, // so remove it since we already extract the title separately. h2s := articleContent.Find("h2") if h2s.Length() == 1 { h2 := h2s.First() h2Text := normalizeText(h2.Text()) lengthSimilarRate := float64(strLen(h2Text)-strLen(articleTitle)) / float64(strLen(articleTitle)) if math.Abs(lengthSimilarRate) < 0.5 { titlesMatch := false if lengthSimilarRate > 0 { titlesMatch = strings.Contains(h2Text, articleTitle) } else { titlesMatch = strings.Contains(articleTitle, h2Text) } if titlesMatch { h2.Remove() } } } clean(articleContent, "iframe") clean(articleContent, "input") clean(articleContent, "textarea") clean(articleContent, "select") clean(articleContent, "button") cleanHeaders(articleContent) // Do these last as the previous stuff may have removed junk // that will affect these cleanConditionally(articleContent, "table") cleanConditionally(articleContent, "ul") cleanConditionally(articleContent, "div") // Remove extra paragraphs // At this point, nasty iframes have been removed, only remain embedded video ones. articleContent.Find("p").Each(func(_ int, p *goquery.Selection) { imgCount := p.Find("img").Length() embedCount := p.Find("embed").Length() objectCount := p.Find("object").Length() iframeCount := p.Find("iframe").Length() totalCount := imgCount + embedCount + objectCount + iframeCount pText := normalizeText(p.Text()) if totalCount == 0 && strLen(pText) == 0 { p.Remove() } }) articleContent.Find("br").Each(func(_ int, br *goquery.Selection) { if br.Next().Is("p") { br.Remove() } }) } // grabArticle fetch the articles using a variety of metrics (content score, classname, element types), // find the content that is most likely to be the stuff a user wants to read. // Then return it wrapped up in a div. func grabArticle(doc *goquery.Document, articleTitle string) (*goquery.Selection, string) { // Create initial variable author := "" elementsToScore := []*goquery.Selection{} // First, node prepping. Trash nodes that look cruddy (like ones with the // class name "comment", etc), and turn divs into P tags where they have been // used inappropriately (as in, where they contain no other block level elements.) doc.Find("*").Each(func(i int, s *goquery.Selection) { matchString := s.AttrOr("class", "") + " " + s.AttrOr("id", "") // If byline, remove this element if rel := s.AttrOr("rel", ""); rel == "author" || rxByline.MatchString(matchString) { text := s.Text() text = strings.TrimSpace(text) if isValidByline(text) { author = text s.Remove() return } } // Remove unlikely candidates if rxUnlikelyCandidates.MatchString(matchString) && !rxOkMaybeItsACandidate.MatchString(matchString) && !s.Is("body") && !s.Is("a") { s.Remove() return } if rxUnlikelyElements.MatchString(goquery.NodeName(s)) { s.Remove() return } // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). if s.Is("div,section,header,h1,h2,h3,h4,h5,h6") && isElementWithoutContent(s) { s.Remove() return } if s.Is("section,h2,h3,h4,h5,h6,p,td,pre") { elementsToScore = append(elementsToScore, s) } // Turn all divs that don't have children block level elements into p's if s.Is("div") { // Sites like http://mobile.slate.com encloses each paragraph with a DIV // element. DIVs with only a P element inside and no text content can be // safely converted into plain P elements to avoid confusing the scoring // algorithm with DIVs with are, in practice, paragraphs. if hasSinglePInsideElement(s) { newNode := s.Children().First() s.ReplaceWithSelection(newNode) elementsToScore = append(elementsToScore, s) } else if !hasChildBlockElement(s) { setNodeTag(s, "p") elementsToScore = append(elementsToScore, s) } } }) // Loop through all paragraphs, and assign a score to them based on how content-y they look. // Then add their score to their parent node. // A score is determined by things like number of commas, class names, etc. Maybe eventually link density. candidates := make(map[string]candidateItem) for _, s := range elementsToScore { // If this paragraph is less than 25 characters, don't even count it. innerText := normalizeText(s.Text()) if strLen(innerText) < 25 { continue } // Exclude nodes with no ancestor. ancestors := getNodeAncestors(s, 3) if len(ancestors) == 0 { continue } // Calculate content score // Add a point for the paragraph itself as a base. contentScore := 1.0 // Add points for any commas within this paragraph. contentScore += float64(strings.Count(innerText, ",")) contentScore += float64(strings.Count(innerText, ",")) // For every 100 characters in this paragraph, add another point. Up to 3 points. contentScore += math.Min(math.Floor(float64(strLen(innerText)/100)), 3) // Initialize and score ancestors. for level, ancestor := range ancestors { // Node score divider: // - parent: 1 (no division) // - grandparent: 2 // - great grandparent+: ancestor level * 3 scoreDivider := 0 if level == 0 { scoreDivider = 1 } else if level == 1 { scoreDivider = 2 } else { scoreDivider = level * 3 } ancestorHash := hashNode(ancestor) if _, ok := candidates[ancestorHash]; !ok { candidates[ancestorHash] = initializeNodeScore(ancestor) } candidate := candidates[ancestorHash] candidate.score += contentScore / float64(scoreDivider) candidates[ancestorHash] = candidate } } // Scale the final candidates score based on link density. Good content // should have a relatively small link density (5% or less) and be mostly // unaffected by this operation. topCandidate := candidateItem{} for hash, candidate := range candidates { candidate.score = candidate.score * (1 - getLinkDensity(candidate.node)) candidates[hash] = candidate if topCandidate.node == nil || candidate.score > topCandidate.score { topCandidate = candidate } } // If we still have no top candidate, use the body as a last resort. if topCandidate.node == nil { body := doc.Find("body").First() bodyHTML, _ := body.Html() newHTML := fmt.Sprintf(`