100 lines
2.3 KiB
Go
100 lines
2.3 KiB
Go
package content
|
|
|
|
import (
|
|
"log"
|
|
"strings"
|
|
"time"
|
|
"unicode"
|
|
|
|
"github.com/tardisx/linkwallet/entity"
|
|
|
|
"github.com/gocolly/colly"
|
|
snowballeng "github.com/kljensen/snowball/english"
|
|
)
|
|
|
|
func FetchPageInfo(bm entity.Bookmark) entity.PageInfo {
|
|
info := entity.PageInfo{
|
|
Fetched: time.Now(),
|
|
}
|
|
|
|
url := bm.URL
|
|
|
|
c := colly.NewCollector()
|
|
c.SetRequestTimeout(5 * time.Second)
|
|
|
|
c.OnHTML("p,h1,h2,h3,h4,h5,h6,li", func(e *colly.HTMLElement) {
|
|
info.RawText = info.RawText + e.Text + "\n"
|
|
})
|
|
|
|
c.OnHTML("head>title", func(h *colly.HTMLElement) {
|
|
info.Title = h.Text
|
|
})
|
|
|
|
c.OnResponse(func(r *colly.Response) {
|
|
info.StatusCode = r.StatusCode
|
|
info.Size = len(r.Body)
|
|
// log.Printf("content type for %s: %s (%d)", r.Request.URL.String(), r.Headers.Get("Content-Type"), info.Size)
|
|
|
|
})
|
|
|
|
// Before making a request print "Visiting ..."
|
|
c.OnRequest(func(r *colly.Request) {
|
|
// log.Println("Visiting", r.URL.String())
|
|
})
|
|
|
|
c.OnError(func(r *colly.Response, err error) {
|
|
log.Printf("error for %s: %s", r.Request.URL.String(), err)
|
|
})
|
|
|
|
c.Visit(url)
|
|
return info
|
|
}
|
|
|
|
func Words(bm *entity.Bookmark) []string {
|
|
words := []string{}
|
|
|
|
words = append(words, StringToStemmedSearchWords(bm.Info.RawText)...)
|
|
words = append(words, StringToStemmedSearchWords(bm.Info.Title)...)
|
|
words = append(words, StringToStemmedSearchWords(bm.URL)...)
|
|
return words
|
|
}
|
|
|
|
// StringToStemmedSearchWords returns a list of stemmed words with stop words
|
|
// removed.
|
|
func StringToStemmedSearchWords(s string) []string {
|
|
words := []string{}
|
|
|
|
words = append(words, stemmerFilter(stopwordFilter(tokenize(s)))...)
|
|
return words
|
|
}
|
|
|
|
// tokenize breaks text into maximal runs of letters and numbers. Every other
// rune acts as a separator and is discarded; empty fields are never returned.
func tokenize(text string) []string {
	isSeparator := func(c rune) bool {
		return !(unicode.IsLetter(c) || unicode.IsNumber(c))
	}
	return strings.FieldsFunc(text, isSeparator)
}
|
|
|
|
func stemmerFilter(tokens []string) []string {
|
|
r := make([]string, len(tokens))
|
|
for i, token := range tokens {
|
|
r[i] = snowballeng.Stem(token, false)
|
|
}
|
|
return r
|
|
}
|
|
|
|
// stopwords is the set of common words excluded from the search index.
// Keys are lower case.
var stopwords = map[string]struct{}{ // I wish Go had built-in sets.
	"a": {}, "and": {}, "be": {}, "have": {}, "i": {},
	"in": {}, "of": {}, "that": {}, "the": {}, "to": {},
}

// stopwordFilter returns the tokens that are not stop words, in their
// original order and spelling. The result is never nil.
//
// The comparison ignores case: tokens are not lower-cased before they get
// here, so an exact-match lookup would let "The" or "AND" slip through.
func stopwordFilter(tokens []string) []string {
	r := make([]string, 0, len(tokens))
	for _, token := range tokens {
		if _, ok := stopwords[strings.ToLower(token)]; !ok {
			r = append(r, token)
		}
	}
	return r
}
|