From 6ee5c22ae61eef80fd3f12770adfc2cf0f5b0fb3 Mon Sep 17 00:00:00 2001
From: Radhi Fadlillah .
-// Whitespace between abc
]*>[ \n\r\t]*){2,}`)
- rxByline = regexp.MustCompile(`(?is)byline|author|dateline|writtenby|p-author`)
- rxUnlikelyCandidates = regexp.MustCompile(`(?is)banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
- rxOkMaybeItsACandidate = regexp.MustCompile(`(?is)and|article|body|column|main|shadow`)
- rxUnlikelyElements = regexp.MustCompile(`(?is)(input|time|button)`)
- rxDivToPElements = regexp.MustCompile(`(?is)<(a|blockquote|dl|div|img|ol|p|pre|table|ul|select)`)
- rxPositive = regexp.MustCompile(`(?is)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
- rxNegative = regexp.MustCompile(`(?is)hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget`)
- rxPIsSentence = regexp.MustCompile(`(?is)\.( |$)`)
- rxVideos = regexp.MustCompile(`(?is)//(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com`)
- rxKillBreaks = regexp.MustCompile(`(?is)(
(\s| ?)*)+`)
- rxComments = regexp.MustCompile(`(?is)`)
-)
-
// candidateItem pairs a DOM node with its accumulated readability score.
// It is the unit stored in the candidates map while ranking content nodes.
type candidateItem struct {
	score float64            // accumulated content score (tag weight, commas, class weight, link density)
	node  *goquery.Selection // the DOM node this score belongs to
}
-
// readability holds the state of a single extraction run:
// the raw HTML, the source URL, and the per-node score table.
type readability struct {
	html       string                   // raw page HTML being processed
	url        *nurl.URL                // parsed source URL of the page
	candidates map[string]candidateItem // node-hash -> scored candidate
}
-
// Metadata is metadata of an article, gathered from the document title
// and <meta> tags (description, og:*, twitter:*).
type Metadata struct {
	Title       string // article title, from <title>/headings or og/twitter tags
	Image       string // primary image URL (og:image preferred over twitter:image)
	Excerpt     string // short description (description, og:description, or twitter:description)
	Author      string // author name taken from author-related meta tags
	MinReadTime int    // estimated minimum read time — set outside this chunk; presumably minutes, confirm at caller
	MaxReadTime int    // estimated maximum read time — set outside this chunk; presumably minutes, confirm at caller
}
-
// Article is the content of an URL after extraction.
type Article struct {
	URL        string   // URL the article was extracted from
	Meta       Metadata // extracted article metadata
	Content    string   // extracted article content
	RawContent string   // content in a less-processed form — exact difference from Content not shown in this chunk
}
-
-// removeScripts removes script tags from the document.
-func removeScripts(doc *goquery.Document) {
- doc.Find("script").Remove()
- doc.Find("noscript").Remove()
-}
-
-// replaceBrs replaces 2 or more successive
elements with a single
elements are ignored. For example:
-//
bar
abc
bar
") - - body.SetHtml(html) - - // Remove empty p - body.Find("p").Each(func(_ int, p *goquery.Selection) { - html, _ := p.Html() - html = strings.TrimSpace(html) - if html == "" { - p.Remove() - } - }) -} - -// prepDocument prepares the HTML document for readability to scrape it. -// This includes things like stripping JS, CSS, and handling terrible markup. -func prepDocument(doc *goquery.Document) { - // Remove all style tags in head - doc.Find("style").Remove() - - // Replace all br - replaceBrs(doc) - - // Replace font tags to span - doc.Find("font").Each(func(_ int, font *goquery.Selection) { - html, _ := font.Html() - font.ReplaceWithHtml("" + html + "") - }) -} - -// getArticleTitle fetchs the article title -func getArticleTitle(doc *goquery.Document) string { - // Get title tag - title := doc.Find("title").First().Text() - title = normalizeText(title) - originalTitle := title - - // Create list of separator - separators := []string{`|`, `-`, `\`, `/`, `>`, `»`} - hierarchialSeparators := []string{`\`, `/`, `>`, `»`} - - // If there's a separator in the title, first remove the final part - titleHadHierarchicalSeparators := false - if idx, sep := findSeparator(title, separators...); idx != -1 { - titleHadHierarchicalSeparators = hasSeparator(title, hierarchialSeparators...) - - index := strings.LastIndex(originalTitle, sep) - title = originalTitle[:index] - - // If the resulting title is too short (3 words or fewer), remove - // the first part instead: - if len(strings.Fields(title)) < 3 { - index = strings.Index(originalTitle, sep) - title = originalTitle[index+1:] - } - } else if strings.Contains(title, ": ") { - // Check if we have an heading containing this exact string, so we - // could assume it's the full title. 
- existInHeading := false - doc.Find("h1,h2").EachWithBreak(func(_ int, heading *goquery.Selection) bool { - headingText := strings.TrimSpace(heading.Text()) - if headingText == title { - existInHeading = true - return false - } - - return true - }) - - // If we don't, let's extract the title out of the original title string. - if !existInHeading { - index := strings.LastIndex(originalTitle, ":") - title = originalTitle[index+1:] - - // If the title is now too short, try the first colon instead: - if len(strings.Fields(title)) < 3 { - index = strings.Index(originalTitle, ":") - title = originalTitle[:index] - // But if we have too many words before the colon there's something weird - // with the titles and the H tags so let's just use the original title instead - } else { - index = strings.Index(originalTitle, ":") - beforeColon := originalTitle[:index] - if len(strings.Fields(beforeColon)) > 5 { - title = originalTitle - } - } - } - } else if strLen(title) > 150 || strLen(title) < 15 { - hOne := doc.Find("h1").First() - if hOne != nil { - title = hOne.Text() - } - } - - // If we now have 4 words or fewer as our title, and either no - // 'hierarchical' separators (\, /, > or ») were found in the original - // title or we decreased the number of words by more than 1 word, use - // the original title. - curTitleWordCount := len(strings.Fields(title)) - noSeparatorWordCount := len(strings.Fields(removeSeparator(originalTitle, separators...))) - if curTitleWordCount <= 4 && (!titleHadHierarchicalSeparators || curTitleWordCount != noSeparatorWordCount-1) { - title = originalTitle - } - - return normalizeText(title) -} - -// getArticleMetadata attempts to get excerpt and byline metadata for the article. 
func getArticleMetadata(doc *goquery.Document) Metadata {
	metadata := Metadata{}
	// mapAttribute keeps the first-seen value for each interesting meta
	// name/property, so earlier tags win over later duplicates.
	mapAttribute := make(map[string]string)

	doc.Find("meta").Each(func(_ int, meta *goquery.Selection) {
		metaName, _ := meta.Attr("name")
		metaProperty, _ := meta.Attr("property")
		metaContent, _ := meta.Attr("content")

		metaName = strings.TrimSpace(metaName)
		metaProperty = strings.TrimSpace(metaProperty)
		metaContent = strings.TrimSpace(metaContent)

		// Fetch author name from any meta whose name/property mentions "author"
		if strings.Contains(metaName+metaProperty, "author") {
			metadata.Author = metaContent
			return
		}

		// Fetch description, title, and image from name-keyed metas
		if metaName == "title" ||
			metaName == "description" ||
			metaName == "twitter:title" ||
			metaName == "twitter:image" ||
			metaName == "twitter:description" {
			if _, exist := mapAttribute[metaName]; !exist {
				mapAttribute[metaName] = metaContent
			}
			return
		}

		// Fetch the OpenGraph equivalents from property-keyed metas
		if metaProperty == "og:description" ||
			metaProperty == "og:image" ||
			metaProperty == "og:title" {
			if _, exist := mapAttribute[metaProperty]; !exist {
				mapAttribute[metaProperty] = metaContent
			}
			return
		}
	})

	// Set final image: prefer og:image, fall back to twitter:image
	if _, exist := mapAttribute["og:image"]; exist {
		metadata.Image = mapAttribute["og:image"]
	} else if _, exist := mapAttribute["twitter:image"]; exist {
		metadata.Image = mapAttribute["twitter:image"]
	}

	// Give protocol-relative image URLs an explicit scheme
	if metadata.Image != "" && strings.HasPrefix(metadata.Image, "//") {
		metadata.Image = "http:" + metadata.Image
	}

	// Set final excerpt: description, then og:description, then twitter:description
	if _, exist := mapAttribute["description"]; exist {
		metadata.Excerpt = mapAttribute["description"]
	} else if _, exist := mapAttribute["og:description"]; exist {
		metadata.Excerpt = mapAttribute["og:description"]
	} else if _, exist := mapAttribute["twitter:description"]; exist {
		metadata.Excerpt = mapAttribute["twitter:description"]
	}

	// Set final title: prefer the heuristics in getArticleTitle, then
	// fall back to og:title / twitter:title
	metadata.Title = getArticleTitle(doc)
	if metadata.Title == "" {
		if _, exist := mapAttribute["og:title"]; exist {
			metadata.Title = mapAttribute["og:title"]
		} else if _, exist := mapAttribute["twitter:title"]; exist {
			metadata.Title = mapAttribute["twitter:title"]
		}
	}

	// Clean up the metadata
	metadata.Title = normalizeText(metadata.Title)
	metadata.Excerpt = normalizeText(metadata.Excerpt)

	return metadata
}

// isValidByline checks whether the input string could be a byline.
// This verifies that the input is a string, and that the length
// is less than 100 chars.
func isValidByline(str string) bool {
	return strLen(str) > 0 && strLen(str) < 100
}

// isElementWithoutContent reports whether s is nil or has an empty
// (whitespace-only) inner HTML.
func isElementWithoutContent(s *goquery.Selection) bool {
	if s == nil {
		return true
	}

	html, _ := s.Html()
	html = strings.TrimSpace(html)
	return html == ""
}

// hasSinglePInsideElement checks if this node has only whitespace and a single P element.
// Returns false if the DIV node contains non-empty text nodes
// or if it contains no P or more than 1 element.
func hasSinglePInsideElement(s *goquery.Selection) bool {
	// There should be exactly 1 element child which is a P
	return s.Children().Length() == 1 && s.Children().First().Is("p")
}

// hasChildBlockElement determines whether element has any children
// block level elements.
-func hasChildBlockElement(s *goquery.Selection) bool { - html, _ := s.Html() - return rxDivToPElements.MatchString(html) -} - -func setNodeTag(s *goquery.Selection, tag string) { - html, _ := s.Html() - newHTML := fmt.Sprintf("<%s>%s%s>", tag, html, tag) - s.ReplaceWithHtml(newHTML) -} - -func getNodeAncestors(node *goquery.Selection, maxDepth int) []*goquery.Selection { - ancestors := []*goquery.Selection{} - parent := node - - for i := 0; i < maxDepth; i++ { - parent = parent.Parent() - if len(parent.Nodes) == 0 { - return ancestors - } - - ancestors = append(ancestors, parent) - } - - return ancestors -} - -func hasAncestorTag(node *goquery.Selection, tag string, maxDepth int) (*goquery.Selection, bool) { - parent := node - - if maxDepth < 0 { - maxDepth = 100 - } - - for i := 0; i < maxDepth; i++ { - parent = parent.Parent() - if len(parent.Nodes) == 0 { - break - } - - if parent.Is(tag) { - return parent, true - } - } - - return nil, false -} - -// initializeNodeScore initializes a node and checks the className/id -// for special names to add to its score. -func initializeNodeScore(node *goquery.Selection) candidateItem { - contentScore := 0.0 - tagName := goquery.NodeName(node) - switch strings.ToLower(tagName) { - case "article": - contentScore += 10 - case "section": - contentScore += 8 - case "div": - contentScore += 5 - case "pre", "blockquote", "td": - contentScore += 3 - case "form", "ol", "ul", "dl", "dd", "dt", "li", "address": - contentScore -= 3 - case "th", "h1", "h2", "h3", "h4", "h5", "h6": - contentScore -= 5 - } - - contentScore += getClassWeight(node) - return candidateItem{contentScore, node} -} - -// getClassWeight gets an elements class/id weight. -// Uses regular expressions to tell if this element looks good or bad. 
-func getClassWeight(node *goquery.Selection) float64 { - weight := 0.0 - if str, b := node.Attr("class"); b { - if rxNegative.MatchString(str) { - weight -= 25 - } - - if rxPositive.MatchString(str) { - weight += 25 - } - } - - if str, b := node.Attr("id"); b { - if rxNegative.MatchString(str) { - weight -= 25 - } - - if rxPositive.MatchString(str) { - weight += 25 - } - } - - return weight -} - -// getLinkDensity gets the density of links as a percentage of the content -// This is the amount of text that is inside a link divided by the total text in the node. -func getLinkDensity(node *goquery.Selection) float64 { - textLength := strLen(normalizeText(node.Text())) - if textLength == 0 { - return 0 - } - - linkLength := 0 - node.Find("a").Each(func(_ int, link *goquery.Selection) { - linkLength += strLen(link.Text()) - }) - - return float64(linkLength) / float64(textLength) -} - -// Remove the style attribute on every e and under. -func cleanStyle(s *goquery.Selection) { - s.Find("*").Each(func(i int, s1 *goquery.Selection) { - tagName := goquery.NodeName(s1) - if strings.ToLower(tagName) == "svg" { - return - } - - s1.RemoveAttr("align") - s1.RemoveAttr("background") - s1.RemoveAttr("bgcolor") - s1.RemoveAttr("border") - s1.RemoveAttr("cellpadding") - s1.RemoveAttr("cellspacing") - s1.RemoveAttr("frame") - s1.RemoveAttr("hspace") - s1.RemoveAttr("rules") - s1.RemoveAttr("style") - s1.RemoveAttr("valign") - s1.RemoveAttr("vspace") - s1.RemoveAttr("onclick") - s1.RemoveAttr("onmouseover") - s1.RemoveAttr("border") - s1.RemoveAttr("style") - - if tagName != "table" && tagName != "th" && tagName != "td" && - tagName != "hr" && tagName != "pre" { - s1.RemoveAttr("width") - s1.RemoveAttr("height") - } - }) -} - -// Return an object indicating how many rows and columns this table has. 
-func getTableRowAndColumnCount(table *goquery.Selection) (int, int) { - rows := 0 - columns := 0 - table.Find("tr").Each(func(_ int, tr *goquery.Selection) { - // Look for rows - strRowSpan, _ := tr.Attr("rowspan") - rowSpan, err := strconv.Atoi(strRowSpan) - if err != nil { - rowSpan = 1 - } - rows += rowSpan - - // Now look for columns - columnInThisRow := 0 - tr.Find("td").Each(func(_ int, td *goquery.Selection) { - strColSpan, _ := tr.Attr("colspan") - colSpan, err := strconv.Atoi(strColSpan) - if err != nil { - colSpan = 1 - } - columnInThisRow += colSpan - }) - - if columnInThisRow > columns { - columns = columnInThisRow - } - }) - - return rows, columns -} - -// Look for 'data' (as opposed to 'layout') tables -func markDataTables(s *goquery.Selection) { - s.Find("table").Each(func(_ int, table *goquery.Selection) { - role, _ := table.Attr("role") - if role == "presentation" { - return - } - - datatable, _ := table.Attr("datatable") - if datatable == "0" { - return - } - - _, summaryExist := table.Attr("summary") - if summaryExist { - table.SetAttr(dataTableAttr, "1") - return - } - - caption := table.Find("caption").First() - if len(caption.Nodes) > 0 && caption.Children().Length() > 0 { - table.SetAttr(dataTableAttr, "1") - return - } - - // If the table has a descendant with any of these tags, consider a data table: - dataTableDescendants := []string{"col", "colgroup", "tfoot", "thead", "th"} - for _, tag := range dataTableDescendants { - if table.Find(tag).Length() > 0 { - table.SetAttr(dataTableAttr, "1") - return - } - } - - // Nested tables indicate a layout table: - if table.Find("table").Length() > 0 { - return - } - - nRow, nColumn := getTableRowAndColumnCount(table) - if nRow >= 10 || nColumn > 4 { - table.SetAttr(dataTableAttr, "1") - return - } - - // Now just go by size entirely: - if nRow*nColumn > 10 { - table.SetAttr(dataTableAttr, "1") - return - } - }) -} - -// Clean an element of all tags of type "tag" if they look fishy. 
-// "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. -func cleanConditionally(e *goquery.Selection, tag string) { - isList := tag == "ul" || tag == "ol" - - e.Find(tag).Each(func(i int, node *goquery.Selection) { - // First check if we're in a data table, in which case don't remove it - if ancestor, hasTag := hasAncestorTag(node, "table", -1); hasTag { - if attr, _ := ancestor.Attr(dataTableAttr); attr == "1" { - return - } - } - - // If it is table, remove data table marker - if tag == "table" { - node.RemoveAttr(dataTableAttr) - } - - contentScore := 0.0 - weight := getClassWeight(node) - if weight+contentScore < 0 { - node.Remove() - return - } - - // If there are not very many commas, and the number of - // non-paragraph elements is more than paragraphs or other - // ominous signs, remove the element. - nodeText := normalizeText(node.Text()) - nCommas := strings.Count(nodeText, ",") - nCommas += strings.Count(nodeText, ",") - if nCommas < 10 { - p := node.Find("p").Length() - img := node.Find("img").Length() - li := node.Find("li").Length() - 100 - input := node.Find("input").Length() - - embedCount := 0 - node.Find("embed").Each(func(i int, embed *goquery.Selection) { - if !rxVideos.MatchString(embed.AttrOr("src", "")) { - embedCount++ - } - }) - - contentLength := strLen(nodeText) - linkDensity := getLinkDensity(node) - _, hasFigureAncestor := hasAncestorTag(node, "figure", 3) - - haveToRemove := (!isList && li > p) || - (img > 1 && float64(p)/float64(img) < 0.5 && !hasFigureAncestor) || - (float64(input) > math.Floor(float64(p)/3)) || - (!isList && contentLength < 25 && (img == 0 || img > 2) && !hasFigureAncestor) || - (!isList && weight < 25 && linkDensity > 0.2) || - (weight >= 25 && linkDensity > 0.5) || - ((embedCount == 1 && contentLength < 75) || embedCount > 1) - - if haveToRemove { - node.Remove() - } - } - }) -} - -// Clean a node of all elements of type "tag". 
-// (Unless it's a youtube/vimeo video. People love movies.) -func clean(s *goquery.Selection, tag string) { - isEmbed := tag == "object" || tag == "embed" || tag == "iframe" - - s.Find(tag).Each(func(i int, target *goquery.Selection) { - attributeValues := "" - for _, attribute := range target.Nodes[0].Attr { - attributeValues += " " + attribute.Val - } - - if isEmbed && rxVideos.MatchString(attributeValues) { - return - } - - if isEmbed && rxVideos.MatchString(target.Text()) { - return - } - - target.Remove() - }) -} - -// Clean out spurious headers from an Element. Checks things like classnames and link density. -func cleanHeaders(s *goquery.Selection) { - s.Find("h1,h2,h3").Each(func(_ int, s1 *goquery.Selection) { - if getClassWeight(s1) < 0 { - s1.Remove() - } - }) -} - -// Prepare the article node for display. Clean out any inline styles, -// iframes, forms, strip extraneous
tags, etc. -func prepArticle(articleContent *goquery.Selection, articleTitle string) { - if articleContent == nil { - return - } - - // Check for data tables before we continue, to avoid removing items in - // those tables, which will often be isolated even though they're - // visually linked to other content-ful elements (text, images, etc.). - markDataTables(articleContent) - - // Remove style attribute - cleanStyle(articleContent) - - // Clean out junk from the article content - cleanConditionally(articleContent, "form") - cleanConditionally(articleContent, "fieldset") - clean(articleContent, "h1") - clean(articleContent, "object") - clean(articleContent, "embed") - clean(articleContent, "footer") - clean(articleContent, "link") - - // Clean out elements have "share" in their id/class combinations from final top candidates, - // which means we don't remove the top candidates even they have "share". - articleContent.Find("*").Each(func(_ int, s *goquery.Selection) { - id, _ := s.Attr("id") - class, _ := s.Attr("class") - matchString := class + " " + id - if strings.Contains(matchString, "share") { - s.Remove() - } - }) - - // If there is only one h2 and its text content substantially equals article title, - // they are probably using it as a header and not a subheader, - // so remove it since we already extract the title separately. 
- h2s := articleContent.Find("h2") - if h2s.Length() == 1 { - h2 := h2s.First() - h2Text := normalizeText(h2.Text()) - lengthSimilarRate := float64(strLen(h2Text)-strLen(articleTitle)) / - float64(strLen(articleTitle)) - - if math.Abs(lengthSimilarRate) < 0.5 { - titlesMatch := false - if lengthSimilarRate > 0 { - titlesMatch = strings.Contains(h2Text, articleTitle) - } else { - titlesMatch = strings.Contains(articleTitle, h2Text) - } - - if titlesMatch { - h2.Remove() - } - } - } - - clean(articleContent, "iframe") - clean(articleContent, "input") - clean(articleContent, "textarea") - clean(articleContent, "select") - clean(articleContent, "button") - cleanHeaders(articleContent) - - // Do these last as the previous stuff may have removed junk - // that will affect these - cleanConditionally(articleContent, "table") - cleanConditionally(articleContent, "ul") - cleanConditionally(articleContent, "div") - - // Remove extra paragraphs - // At this point, nasty iframes have been removed, only remain embedded video ones. - articleContent.Find("p").Each(func(_ int, p *goquery.Selection) { - imgCount := p.Find("img").Length() - embedCount := p.Find("embed").Length() - objectCount := p.Find("object").Length() - iframeCount := p.Find("iframe").Length() - totalCount := imgCount + embedCount + objectCount + iframeCount - - pText := normalizeText(p.Text()) - if totalCount == 0 && strLen(pText) == 0 { - p.Remove() - } - }) - - articleContent.Find("br").Each(func(_ int, br *goquery.Selection) { - if br.Next().Is("p") { - br.Remove() - } - }) -} - -// grabArticle fetch the articles using a variety of metrics (content score, classname, element types), -// find the content that is most likely to be the stuff a user wants to read. -// Then return it wrapped up in a div. -func grabArticle(doc *goquery.Document, articleTitle string) (*goquery.Selection, string) { - // Create initial variable - author := "" - elementsToScore := []*goquery.Selection{} - - // First, node prepping. 
Trash nodes that look cruddy (like ones with the - // class name "comment", etc), and turn divs into P tags where they have been - // used inappropriately (as in, where they contain no other block level elements.) - doc.Find("*").Each(func(i int, s *goquery.Selection) { - matchString := s.AttrOr("class", "") + " " + s.AttrOr("id", "") - - // If byline, remove this element - if rel := s.AttrOr("rel", ""); rel == "author" || rxByline.MatchString(matchString) { - text := s.Text() - text = strings.TrimSpace(text) - if isValidByline(text) { - author = text - s.Remove() - return - } - } - - // Remove unlikely candidates - if rxUnlikelyCandidates.MatchString(matchString) && - !rxOkMaybeItsACandidate.MatchString(matchString) && - !s.Is("body") && !s.Is("a") { - s.Remove() - return - } - - if rxUnlikelyElements.MatchString(goquery.NodeName(s)) { - s.Remove() - return - } - - // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). - if s.Is("div,section,header,h1,h2,h3,h4,h5,h6") && isElementWithoutContent(s) { - s.Remove() - return - } - - if s.Is("section,h2,h3,h4,h5,h6,p,td,pre") { - elementsToScore = append(elementsToScore, s) - } - - // Turn all divs that don't have children block level elements into p's - if s.Is("div") { - // Sites like http://mobile.slate.com encloses each paragraph with a DIV - // element. DIVs with only a P element inside and no text content can be - // safely converted into plain P elements to avoid confusing the scoring - // algorithm with DIVs with are, in practice, paragraphs. - if hasSinglePInsideElement(s) { - newNode := s.Children().First() - s.ReplaceWithSelection(newNode) - elementsToScore = append(elementsToScore, s) - } else if !hasChildBlockElement(s) { - setNodeTag(s, "p") - elementsToScore = append(elementsToScore, s) - } - } - }) - - // Loop through all paragraphs, and assign a score to them based on how content-y they look. - // Then add their score to their parent node. 
- // A score is determined by things like number of commas, class names, etc. Maybe eventually link density. - candidates := make(map[string]candidateItem) - for _, s := range elementsToScore { - // If this paragraph is less than 25 characters, don't even count it. - innerText := normalizeText(s.Text()) - if strLen(innerText) < 25 { - continue - } - - // Exclude nodes with no ancestor. - ancestors := getNodeAncestors(s, 3) - if len(ancestors) == 0 { - continue - } - - // Calculate content score - // Add a point for the paragraph itself as a base. - contentScore := 1.0 - - // Add points for any commas within this paragraph. - contentScore += float64(strings.Count(innerText, ",")) - contentScore += float64(strings.Count(innerText, ",")) - - // For every 100 characters in this paragraph, add another point. Up to 3 points. - contentScore += math.Min(math.Floor(float64(strLen(innerText)/100)), 3) - - // Initialize and score ancestors. - for level, ancestor := range ancestors { - // Node score divider: - // - parent: 1 (no division) - // - grandparent: 2 - // - great grandparent+: ancestor level * 3 - scoreDivider := 0 - if level == 0 { - scoreDivider = 1 - } else if level == 1 { - scoreDivider = 2 - } else { - scoreDivider = level * 3 - } - - ancestorHash := hashNode(ancestor) - if _, ok := candidates[ancestorHash]; !ok { - candidates[ancestorHash] = initializeNodeScore(ancestor) - } - - candidate := candidates[ancestorHash] - candidate.score += contentScore / float64(scoreDivider) - candidates[ancestorHash] = candidate - } - } - - // Scale the final candidates score based on link density. Good content - // should have a relatively small link density (5% or less) and be mostly - // unaffected by this operation. 
- topCandidate := candidateItem{} - for hash, candidate := range candidates { - candidate.score = candidate.score * (1 - getLinkDensity(candidate.node)) - candidates[hash] = candidate - - if topCandidate.node == nil || candidate.score > topCandidate.score { - topCandidate = candidate - } - } - - // If we still have no top candidate, use the body as a last resort. - if topCandidate.node == nil { - body := doc.Find("body").First() - - bodyHTML, _ := body.Html() - newHTML := fmt.Sprintf(`