Merge the feature branch to introduce the bleve indexing.
commit e8c2bc7e4a Author: Justin Hawkins <justin@hawkins.id.au> Date: Sat May 3 12:36:18 2025 +0930 Clean up menu/version
commit 1993533a46 Author: Justin Hawkins <justin@hawkins.id.au> Date: Sat May 3 12:31:50 2025 +0930 Update README
commit 044cc830dc Author: Justin Hawkins <justin@hawkins.id.au> Date: Sat May 3 12:22:33 2025 +0930 No longer needed
commit a7c37ad7c5 Author: Justin Hawkins <justin@hawkins.id.au> Date: Sat May 3 12:14:59 2025 +0930 Fixup version handling
commit ade0b748e9 Author: Justin Hawkins <justin@hawkins.id.au> Date: Sat May 3 11:58:15 2025 +0930 Use the correct analyser for searches
commit e5a65cf5cf Author: Justin Hawkins <justin@hawkins.id.au> Date: Fri May 2 19:51:54 2025 +0930 Fix version in template
commit 0171be0ee4 Author: Justin Hawkins <justin@hawkins.id.au> Date: Fri May 2 19:51:41 2025 +0930 Rescrape all links if needed on startup
commit ae654998f7 Author: Justin Hawkins <justin@hawkins.id.au> Date: Fri May 2 19:51:25 2025 +0930 Spelling
commit bfe9bbee02 Author: Justin Hawkins <justin@hawkins.id.au> Date: Fri May 2 19:34:06 2025 +0930 Make goreleaser set the version
commit 4436313413 Author: Justin Hawkins <justin@hawkins.id.au> Date: Fri May 2 11:43:58 2025 +0930 Make release matrix sane
commit 7b467ecee7 Author: Justin Hawkins <justin@hawkins.id.au> Date: Fri May 2 11:40:07 2025 +0930 I hate YAML, so much.
commit b578e0f044 Author: Justin Hawkins <justin@hawkins.id.au> Date: Thu May 1 23:47:07 2025 +0930 Update goreleaser
commit fba84f0827 Author: Justin Hawkins <justin@hawkins.id.au> Date: Thu May 1 23:45:46 2025 +0930 Update version
commit e4edb08bd1 Author: Justin Hawkins <justin@hawkins.id.au> Date: Thu May 1 23:42:59 2025 +0930 Deps
commit 58b6692d1b Author: Justin Hawkins <justin@hawkins.id.au> Date: Thu May 1 23:39:51 2025 +0930 Mostly done, first cut
commit badbe5e92f Author: Justin Hawkins <justin@hawkins.id.au> Date: Sun Apr 27 20:28:37 2025 +0930 Remove unused code
commit 903240dd18 Author: Justin Hawkins <justin@hawkins.id.au> Date: Sun Apr 27 20:26:19 2025 +0930 Update deps
commit de90b9951a Author: Justin Hawkins <justin@hawkins.id.au> Date: Sun Apr 27 20:21:33 2025 +0930 Keep on bleving
commit 9b15528510 Author: Justin Hawkins <justin@hawkins.id.au> Date: Fri Apr 25 23:57:04 2025 +0930 Start of blevification
This commit is contained in:
@@ -2,14 +2,11 @@ package content
|
||||
|
||||
import (
|
||||
"log"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode"
|
||||
|
||||
"github.com/tardisx/linkwallet/entity"
|
||||
|
||||
"github.com/gocolly/colly"
|
||||
snowballeng "github.com/kljensen/snowball/english"
|
||||
)
|
||||
|
||||
func FetchPageInfo(bm entity.Bookmark) entity.PageInfo {
|
||||
@@ -33,11 +30,8 @@ func FetchPageInfo(bm entity.Bookmark) entity.PageInfo {
|
||||
c.OnResponse(func(r *colly.Response) {
|
||||
info.StatusCode = r.StatusCode
|
||||
info.Size = len(r.Body)
|
||||
// log.Printf("content type for %s: %s (%d)", r.Request.URL.String(), r.Headers.Get("Content-Type"), info.Size)
|
||||
|
||||
})
|
||||
|
||||
// Before making a request print "Visiting ..."
|
||||
c.OnRequest(func(r *colly.Request) {
|
||||
// log.Println("Visiting", r.URL.String())
|
||||
})
|
||||
@@ -49,51 +43,3 @@ func FetchPageInfo(bm entity.Bookmark) entity.PageInfo {
|
||||
c.Visit(url)
|
||||
return info
|
||||
}
|
||||
|
||||
func Words(bm *entity.Bookmark) []string {
|
||||
words := []string{}
|
||||
|
||||
words = append(words, StringToStemmedSearchWords(bm.Info.RawText)...)
|
||||
words = append(words, StringToStemmedSearchWords(bm.Info.Title)...)
|
||||
words = append(words, StringToStemmedSearchWords(bm.URL)...)
|
||||
return words
|
||||
}
|
||||
|
||||
// StringToStemmedSearchWords returns a list of stemmed words with stop words
|
||||
// removed.
|
||||
func StringToStemmedSearchWords(s string) []string {
|
||||
words := []string{}
|
||||
|
||||
words = append(words, stemmerFilter(stopwordFilter(tokenize(s)))...)
|
||||
return words
|
||||
}
|
||||
|
||||
// tokenize splits text into tokens made of consecutive letters and numbers.
// Every other rune (whitespace, punctuation, symbols) acts as a separator and
// is discarded; empty tokens are never produced.
func tokenize(text string) []string {
	isSeparator := func(c rune) bool {
		// Only letters and numbers may be part of a token.
		return !(unicode.IsLetter(c) || unicode.IsNumber(c))
	}
	return strings.FieldsFunc(text, isSeparator)
}
|
||||
|
||||
func stemmerFilter(tokens []string) []string {
|
||||
r := make([]string, len(tokens))
|
||||
for i, token := range tokens {
|
||||
r[i] = snowballeng.Stem(token, false)
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// stopwords is the set of common English words that are dropped from search
// input. All keys are lower-case.
var stopwords = map[string]struct{}{
	"a": {}, "and": {}, "be": {}, "have": {}, "i": {},
	"in": {}, "of": {}, "that": {}, "the": {}, "to": {},
}

// stopwordFilter returns the tokens that are not stop words, preserving their
// order and original casing. The result is a new, non-nil slice.
//
// Matching is case-insensitive: tokenize does not normalise case, so without
// lower-casing here a capitalised stop word such as "The" at the start of a
// sentence would slip through the filter.
func stopwordFilter(tokens []string) []string {
	r := make([]string, 0, len(tokens))
	for _, token := range tokens {
		if _, ok := stopwords[strings.ToLower(token)]; ok {
			continue
		}
		r = append(r, token)
	}
	return r
}
|
||||
|
||||
@@ -47,45 +47,45 @@ func TestSimpleScrape(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestWords(t *testing.T) {
|
||||
// func TestWords(t *testing.T) {
|
||||
|
||||
bm := entity.Bookmark{
|
||||
// ID: 0,
|
||||
// URL: "",
|
||||
Info: entity.PageInfo{RawText: "the quick brown fox jumped over the lazy dog"},
|
||||
// Tags: []string{},
|
||||
}
|
||||
words := Words(&bm)
|
||||
if len(words) != 7 {
|
||||
t.Errorf("got %d words not 7", len(words))
|
||||
} else {
|
||||
if words[0] != "quick" ||
|
||||
words[1] != "brown" ||
|
||||
words[2] != "fox" ||
|
||||
words[3] != "jump" ||
|
||||
words[4] != "over" ||
|
||||
words[5] != "lazi" ||
|
||||
words[6] != "dog" {
|
||||
t.Error("incorrect words returned")
|
||||
}
|
||||
}
|
||||
}
|
||||
// bm := entity.Bookmark{
|
||||
// // ID: 0,
|
||||
// // URL: "",
|
||||
// Info: entity.PageInfo{RawText: "the quick brown fox jumped over the lazy dog"},
|
||||
// // Tags: []string{},
|
||||
// }
|
||||
// words := Words(&bm)
|
||||
// if len(words) != 7 {
|
||||
// t.Errorf("got %d words not 7", len(words))
|
||||
// } else {
|
||||
// if words[0] != "quick" ||
|
||||
// words[1] != "brown" ||
|
||||
// words[2] != "fox" ||
|
||||
// words[3] != "jump" ||
|
||||
// words[4] != "over" ||
|
||||
// words[5] != "lazi" ||
|
||||
// words[6] != "dog" {
|
||||
// t.Error("incorrect words returned")
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
func TestStemmer(t *testing.T) {
|
||||
s := `quick quick fox 😂 smile http://google.com`
|
||||
words1 := StringToStemmedSearchWords(s)
|
||||
t.Log(words1)
|
||||
if len(words1) != 7 {
|
||||
t.Error("wrong number of words")
|
||||
}
|
||||
if words1[0] != "quick" ||
|
||||
words1[1] != "quick" ||
|
||||
words1[2] != "fox" ||
|
||||
words1[3] != "smile" ||
|
||||
words1[4] != "http" ||
|
||||
words1[5] != "googl" ||
|
||||
words1[6] != "com" {
|
||||
t.Error("bad words")
|
||||
}
|
||||
// func TestStemmer(t *testing.T) {
|
||||
// s := `quick quick fox 😂 smile http://google.com`
|
||||
// words1 := StringToStemmedSearchWords(s)
|
||||
// t.Log(words1)
|
||||
// if len(words1) != 7 {
|
||||
// t.Error("wrong number of words")
|
||||
// }
|
||||
// if words1[0] != "quick" ||
|
||||
// words1[1] != "quick" ||
|
||||
// words1[2] != "fox" ||
|
||||
// words1[3] != "smile" ||
|
||||
// words1[4] != "http" ||
|
||||
// words1[5] != "googl" ||
|
||||
// words1[6] != "com" {
|
||||
// t.Error("bad words")
|
||||
// }
|
||||
|
||||
}
|
||||
// }
|
||||
|
||||
Reference in New Issue
Block a user