Merge the feature branch to introduce the bleve indexing.

commit e8c2bc7e4a
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Sat May 3 12:36:18 2025 +0930

    Clean up menu/version

commit 1993533a46
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Sat May 3 12:31:50 2025 +0930

    Update README

commit 044cc830dc
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Sat May 3 12:22:33 2025 +0930

    No longer needed

commit a7c37ad7c5
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Sat May 3 12:14:59 2025 +0930

    Fixup version handling

commit ade0b748e9
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Sat May 3 11:58:15 2025 +0930

    Use the correct analyser for searches

commit e5a65cf5cf
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Fri May 2 19:51:54 2025 +0930

    Fix version in template

commit 0171be0ee4
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Fri May 2 19:51:41 2025 +0930

    Rescrape all links if needed on startup

commit ae654998f7
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Fri May 2 19:51:25 2025 +0930

    Spelling

commit bfe9bbee02
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Fri May 2 19:34:06 2025 +0930

    Make goreleaser set the version

commit 4436313413
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Fri May 2 11:43:58 2025 +0930

    Make release matrix sane

commit 7b467ecee7
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Fri May 2 11:40:07 2025 +0930

    I hate YAML, so much.

commit b578e0f044
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Thu May 1 23:47:07 2025 +0930

    Update goreleaser

commit fba84f0827
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Thu May 1 23:45:46 2025 +0930

    Update version

commit e4edb08bd1
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Thu May 1 23:42:59 2025 +0930

    Deps

commit 58b6692d1b
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Thu May 1 23:39:51 2025 +0930

    Mostly done, first cut

commit badbe5e92f
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Sun Apr 27 20:28:37 2025 +0930

    Remove unused code

commit 903240dd18
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Sun Apr 27 20:26:19 2025 +0930

    Update deps

commit de90b9951a
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Sun Apr 27 20:21:33 2025 +0930

    Keep on bleving

commit 9b15528510
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Fri Apr 25 23:57:04 2025 +0930

    Start of blevification
This commit is contained in:
2025-05-03 12:37:44 +09:30
parent 3a5fc1d66d
commit d2aa6fdd2f
24 changed files with 797 additions and 670 deletions

View File

@@ -2,14 +2,11 @@ package content
import (
"log"
"strings"
"time"
"unicode"
"github.com/tardisx/linkwallet/entity"
"github.com/gocolly/colly"
snowballeng "github.com/kljensen/snowball/english"
)
func FetchPageInfo(bm entity.Bookmark) entity.PageInfo {
@@ -33,11 +30,8 @@ func FetchPageInfo(bm entity.Bookmark) entity.PageInfo {
c.OnResponse(func(r *colly.Response) {
info.StatusCode = r.StatusCode
info.Size = len(r.Body)
// log.Printf("content type for %s: %s (%d)", r.Request.URL.String(), r.Headers.Get("Content-Type"), info.Size)
})
// Before making a request print "Visiting ..."
c.OnRequest(func(r *colly.Request) {
// log.Println("Visiting", r.URL.String())
})
@@ -49,51 +43,3 @@ func FetchPageInfo(bm entity.Bookmark) entity.PageInfo {
c.Visit(url)
return info
}
// Words collects the stemmed search words for a bookmark. The words are
// gathered from the scraped page text first, then the page title, and
// finally the bookmark URL, and are returned in that order.
func Words(bm *entity.Bookmark) []string {
	sources := [...]string{bm.Info.RawText, bm.Info.Title, bm.URL}

	collected := []string{}
	for _, src := range sources {
		collected = append(collected, StringToStemmedSearchWords(src)...)
	}
	return collected
}
// StringToStemmedSearchWords returns a list of stemmed words with stop words
// removed.
func StringToStemmedSearchWords(s string) []string {
words := []string{}
words = append(words, stemmerFilter(stopwordFilter(tokenize(s)))...)
return words
}
// tokenize breaks text into runs of letters and numbers. Every other rune
// (whitespace, punctuation, symbols, emoji, ...) acts as a separator and is
// dropped from the output.
func tokenize(text string) []string {
	isSeparator := func(c rune) bool {
		if unicode.IsLetter(c) || unicode.IsNumber(c) {
			return false
		}
		return true
	}
	return strings.FieldsFunc(text, isSeparator)
}
// stemmerFilter maps every token to its English Snowball stem. The returned
// slice has the same length and ordering as tokens.
func stemmerFilter(tokens []string) []string {
	stems := make([]string, 0, len(tokens))
	for _, tok := range tokens {
		// The second argument is passed as false, exactly as before.
		stems = append(stems, snowballeng.Stem(tok, false))
	}
	return stems
}
// stopwords is the set of common English words that carry no search value.
// Keys are stored in lower case; Go has no built-in set type, so an empty
// struct is used as the value.
var stopwords = map[string]struct{}{
	"a": {}, "and": {}, "be": {}, "have": {}, "i": {},
	"in": {}, "of": {}, "that": {}, "the": {}, "to": {},
}

// stopwordFilter returns the tokens that are not stop words, preserving
// their order and original spelling. The result is always a non-nil slice.
//
// The membership test is case-insensitive: the stopwords set only contains
// lower-case keys, so each token is lower-cased for the lookup. Without
// this, capitalised stop words such as "The", "And" or "I" would slip
// through the filter.
func stopwordFilter(tokens []string) []string {
	kept := make([]string, 0, len(tokens))
	for _, token := range tokens {
		if _, isStop := stopwords[strings.ToLower(token)]; isStop {
			continue
		}
		kept = append(kept, token)
	}
	return kept
}

View File

@@ -47,45 +47,45 @@ func TestSimpleScrape(t *testing.T) {
}
}
func TestWords(t *testing.T) {
// func TestWords(t *testing.T) {
bm := entity.Bookmark{
// ID: 0,
// URL: "",
Info: entity.PageInfo{RawText: "the quick brown fox jumped over the lazy dog"},
// Tags: []string{},
}
words := Words(&bm)
if len(words) != 7 {
t.Errorf("got %d words not 7", len(words))
} else {
if words[0] != "quick" ||
words[1] != "brown" ||
words[2] != "fox" ||
words[3] != "jump" ||
words[4] != "over" ||
words[5] != "lazi" ||
words[6] != "dog" {
t.Error("incorrect words returned")
}
}
}
// bm := entity.Bookmark{
// // ID: 0,
// // URL: "",
// Info: entity.PageInfo{RawText: "the quick brown fox jumped over the lazy dog"},
// // Tags: []string{},
// }
// words := Words(&bm)
// if len(words) != 7 {
// t.Errorf("got %d words not 7", len(words))
// } else {
// if words[0] != "quick" ||
// words[1] != "brown" ||
// words[2] != "fox" ||
// words[3] != "jump" ||
// words[4] != "over" ||
// words[5] != "lazi" ||
// words[6] != "dog" {
// t.Error("incorrect words returned")
// }
// }
// }
func TestStemmer(t *testing.T) {
s := `quick quick fox 😂 smile http://google.com`
words1 := StringToStemmedSearchWords(s)
t.Log(words1)
if len(words1) != 7 {
t.Error("wrong number of words")
}
if words1[0] != "quick" ||
words1[1] != "quick" ||
words1[2] != "fox" ||
words1[3] != "smile" ||
words1[4] != "http" ||
words1[5] != "googl" ||
words1[6] != "com" {
t.Error("bad words")
}
// func TestStemmer(t *testing.T) {
// s := `quick quick fox 😂 smile http://google.com`
// words1 := StringToStemmedSearchWords(s)
// t.Log(words1)
// if len(words1) != 7 {
// t.Error("wrong number of words")
// }
// if words1[0] != "quick" ||
// words1[1] != "quick" ||
// words1[2] != "fox" ||
// words1[3] != "smile" ||
// words1[4] != "http" ||
// words1[5] != "googl" ||
// words1[6] != "com" {
// t.Error("bad words")
// }
}
// }