Merge the feature branch to introduce the bleve indexing.

commit e8c2bc7e4a
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Sat May 3 12:36:18 2025 +0930

    Clean up menu/version

commit 1993533a46
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Sat May 3 12:31:50 2025 +0930

    Update README

commit 044cc830dc
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Sat May 3 12:22:33 2025 +0930

    No longer needed

commit a7c37ad7c5
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Sat May 3 12:14:59 2025 +0930

    Fixup version handling

commit ade0b748e9
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Sat May 3 11:58:15 2025 +0930

    Use the correct analyser for searches

commit e5a65cf5cf
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Fri May 2 19:51:54 2025 +0930

    Fix version in template

commit 0171be0ee4
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Fri May 2 19:51:41 2025 +0930

    Rescrape all links if needed on startup

commit ae654998f7
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Fri May 2 19:51:25 2025 +0930

    Spelling

commit bfe9bbee02
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Fri May 2 19:34:06 2025 +0930

    Make goreleaser set the version

commit 4436313413
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Fri May 2 11:43:58 2025 +0930

    Make release matrix sane

commit 7b467ecee7
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Fri May 2 11:40:07 2025 +0930

    I hate YAML, so much.

commit b578e0f044
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Thu May 1 23:47:07 2025 +0930

    Update goreleaser

commit fba84f0827
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Thu May 1 23:45:46 2025 +0930

    Update version

commit e4edb08bd1
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Thu May 1 23:42:59 2025 +0930

    Deps

commit 58b6692d1b
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Thu May 1 23:39:51 2025 +0930

    Mostly done, first cut

commit badbe5e92f
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Sun Apr 27 20:28:37 2025 +0930

    Remove unused code

commit 903240dd18
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Sun Apr 27 20:26:19 2025 +0930

    Update deps

commit de90b9951a
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Sun Apr 27 20:21:33 2025 +0930

    Keep on bleving

commit 9b15528510
Author: Justin Hawkins <justin@hawkins.id.au>
Date:   Fri Apr 25 23:57:04 2025 +0930

    Start of blevification
This commit is contained in:
2025-05-03 12:37:44 +09:30
parent 3a5fc1d66d
commit d2aa6fdd2f
24 changed files with 797 additions and 670 deletions

View File

@@ -3,13 +3,19 @@ package db
import (
"errors"
"fmt"
"html/template"
"io"
"log"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
"github.com/blevesearch/bleve/v2"
"github.com/blevesearch/bleve/v2/analysis/lang/en"
"github.com/blevesearch/bleve/v2/search/query"
"github.com/tardisx/linkwallet/content"
"github.com/tardisx/linkwallet/entity"
@@ -22,9 +28,9 @@ type BookmarkManager struct {
}
type SearchOptions struct {
Query string
Tags []string
Sort string
All bool
Query string
Results int
}
func NewBookmarkManager(db *DB) *BookmarkManager {
@@ -63,19 +69,19 @@ func (m *BookmarkManager) DeleteBookmark(bm *entity.Bookmark) error {
// delete it
m.db.store.DeleteMatching(bm, bolthold.Where("ID").Eq(bm.ID))
// delete all the index entries
m.db.UpdateIndexForWordsByID([]string{}, bm.ID)
return nil
return m.db.bleve.Delete(fmt.Sprint(bm.ID))
}
// ListBookmarks returns all bookmarks.
func (m *BookmarkManager) ListBookmarks() ([]entity.Bookmark, error) {
bookmarks := make([]entity.Bookmark, 0)
err := m.db.store.Find(&bookmarks, &bolthold.Query{})
if err != nil {
panic(err)
}
return bookmarks, nil
}
// func (m *BookmarkManager) ListBookmarks() ([]entity.Bookmark, error) {
// bookmarks := make([]entity.Bookmark, 0)
// err := m.db.store.Find(&bookmarks, &bolthold.Query{})
// if err != nil {
// panic(err)
// }
// log.Printf("found %d bookmarks", len(bookmarks))
// return bookmarks, nil
// }
// ExportBookmarks exports all bookmarks to an io.Writer
func (m *BookmarkManager) ExportBookmarks(w io.Writer) error {
@@ -108,80 +114,53 @@ func (m *BookmarkManager) LoadBookmarkByID(id uint64) entity.Bookmark {
return ret
}
func (m *BookmarkManager) Search(opts SearchOptions) ([]entity.Bookmark, error) {
// first get a list of all the ids that match our query
idsMatchingQuery := make([]uint64, 0, 0)
counts := make(map[uint64]uint8)
words := content.StringToStemmedSearchWords(opts.Query)
for _, word := range words {
var wi *entity.WordIndex
err := m.db.store.Get("word_index_"+word, &wi)
if err == bolthold.ErrNotFound {
continue
}
if err != nil {
return nil, fmt.Errorf("error retrieving index: %w", err)
}
for k := range wi.Bitmap {
counts[k]++
}
func (m *BookmarkManager) Search(opts SearchOptions) ([]entity.BookmarkSearchResult, error) {
found := []entity.BookmarkSearchResult{}
if opts.All && opts.Query != "" {
panic("can't fetch all with query")
}
for k, v := range counts {
if v == uint8(len(words)) {
idsMatchingQuery = append(idsMatchingQuery, k)
if len(idsMatchingQuery) > 10 {
break
}
}
}
var q query.Query
// now we can do our search
bhQuery := bolthold.Query{}
if opts.Query != "" {
bhQuery = bolthold.Query(*bhQuery.And("ID").In(bolthold.Slice(idsMatchingQuery)...))
}
if opts.Tags != nil && len(opts.Tags) > 0 {
bhQuery = bolthold.Query(*bhQuery.And("Tags").ContainsAll(bolthold.Slice(opts.Tags)...))
}
reverse := false
sortOrder := opts.Sort
if sortOrder != "" && sortOrder[0] == '-' {
reverse = true
sortOrder = sortOrder[1:]
}
if sortOrder == "title" {
bhQuery.SortBy("Info.Title")
} else if sortOrder == "created" {
bhQuery.SortBy("TimestampCreated")
} else if sortOrder == "scraped" {
bhQuery.SortBy("TimestampLastScraped")
if opts.All {
q = bleve.NewMatchAllQuery()
} else {
bhQuery.SortBy("ID")
mq := bleve.NewMatchQuery(opts.Query)
mq.Analyzer = en.AnalyzerName
tq := bleve.NewTermQuery(opts.Query)
q = bleve.NewDisjunctionQuery(mq, tq)
}
if reverse {
bhQuery = *bhQuery.Reverse()
req := bleve.NewSearchRequest(q)
if opts.Results > 0 {
req.Size = opts.Results
}
req.Highlight = bleve.NewHighlightWithStyle("html")
out := []entity.Bookmark{}
err := m.db.store.ForEach(&bhQuery,
func(bm *entity.Bookmark) error {
out = append(out, *bm)
return nil
})
sr, err := m.db.bleve.Search(req)
if err != nil {
panic(err)
}
// log.Printf("%#v", m.db.bleve.StatsMap())
if sr.Total > 0 {
for _, dm := range sr.Hits {
id, _ := strconv.ParseUint(dm.ID, 10, 64)
bm := m.LoadBookmarkByID(id)
bsr := entity.BookmarkSearchResult{
Bookmark: bm,
Score: dm.Score,
Highlight: template.HTML(strings.Join(dm.Fragments["Info.RawText"], "\n")),
}
found = append(found, bsr)
}
}
m.db.IncrementSearches()
return out, nil
return found, nil
}
func (m *BookmarkManager) ScrapeAndIndex(bm *entity.Bookmark) error {
@@ -205,9 +184,11 @@ func (m *BookmarkManager) ScrapeAndIndex(bm *entity.Bookmark) error {
}
func (m *BookmarkManager) UpdateIndexForBookmark(bm *entity.Bookmark) {
words := content.Words(bm)
words = append(words, bm.Tags...)
m.db.UpdateIndexForWordsByID(words, bm.ID)
log.Printf("inserting into bleve data for %s", bm.URL)
err := m.db.bleve.Index(fmt.Sprint(bm.ID), bm)
if err != nil {
panic(err)
}
}
func (m *BookmarkManager) QueueScrape(bm *entity.Bookmark) {
@@ -281,6 +262,17 @@ func (m *BookmarkManager) UpdateContent() {
}
}
// AllBookmarks returns every bookmark held in the bolthold store. It queries
// the store directly and does not use the index for this operation.
//
// On failure it returns a nil slice together with an error wrapping the
// store error, so callers can handle it through the error result the
// signature already declares instead of the process panicking.
func (m *BookmarkManager) AllBookmarks() ([]entity.Bookmark, error) {
	// Start from an empty, non-nil slice so a successful call always yields
	// a usable value, even when nothing is stored.
	bookmarks := make([]entity.Bookmark, 0)
	// An empty query places no restriction on the records that are returned.
	if err := m.db.store.Find(&bookmarks, &bolthold.Query{}); err != nil {
		return nil, fmt.Errorf("listing all bookmarks: %w", err)
	}
	return bookmarks, nil
}
func (m *BookmarkManager) Stats() (entity.DBStats, error) {
stats := entity.DBStats{}
err := m.db.store.Get("stats", &stats)
@@ -293,5 +285,25 @@ func (m *BookmarkManager) Stats() (entity.DBStats, error) {
return stats, fmt.Errorf("could not load db file size: %s", err)
}
stats.FileSize = int(fi.Size())
indexSize, err := getBleveIndexSize(m.db.file + ".bleve")
if err != nil {
return entity.DBStats{}, err
}
stats.IndexSize = int(indexSize)
return stats, nil
}
// getBleveIndexSize walks the file tree rooted at path and returns the sum,
// in bytes, of the sizes of every non-directory entry it visits.
//
// Any error reported by the walk aborts the traversal and is returned as-is,
// alongside whatever total had been accumulated up to that point.
func getBleveIndexSize(path string) (int64, error) {
	var total int64

	visit := func(_ string, fi os.FileInfo, walkErr error) error {
		switch {
		case walkErr != nil:
			// Propagate the failure; this stops the walk.
			return walkErr
		case fi.IsDir():
			// Directories contribute nothing themselves, only their contents do.
			return nil
		}
		total += fi.Size()
		return nil
	}

	err := filepath.Walk(path, visit)
	return total, err
}

View File

@@ -4,6 +4,11 @@ import (
"fmt"
"time"
"github.com/blevesearch/bleve/v2"
"github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"
"github.com/blevesearch/bleve/v2/analysis/lang/en"
"github.com/blevesearch/bleve/v2/mapping"
"github.com/tardisx/linkwallet/entity"
bolthold "github.com/timshannon/bolthold"
)
@@ -11,19 +16,70 @@ import (
// DB groups the storage handles used by this package: the bolthold store
// and the bleve search index that sits beside it on disk.
type DB struct {
// store is the bolthold store opened from the database path.
store *bolthold.Store
// file is the path the store was opened from; the bleve index path is
// derived from it by appending ".bleve".
file string
// bleve is the search index, created or opened alongside the store.
bleve bleve.Index
}
func (db *DB) Open(path string) error {
// Open opens the bookmark boltdb store and the bleve index. It returns
// true if the index was newly created, so that the caller knows all
// bookmarks need to be re-scraped.
func (db *DB) Open(path string) (bool, error) {
// options := bolthold.DefaultOptions
// options.Dir = dir
// options.ValueDir = dir
rescrapeNeeded := false
store, err := bolthold.Open(path, 0666, nil)
if err != nil {
return fmt.Errorf("cannot open '%s' - %s", path, err)
return false, fmt.Errorf("cannot open '%s' - %s", path, err)
}
blevePath := path + ".bleve"
index, err := bleve.New(blevePath, createIndexMapping())
if err != nil {
if err == bleve.ErrorIndexPathExists {
index, err = bleve.Open(blevePath)
if err != nil {
return false, fmt.Errorf("cannot open bleve '%s' - %s", path, err)
}
} else {
return false, fmt.Errorf("cannot open bleve '%s' - %s", path, err)
}
} else {
// we just created an index, one didn't exist, so we need to queue
// all bookmarks to be scraped
rescrapeNeeded = true
}
db.store = store
db.file = path
return nil
db.bleve = index
return rescrapeNeeded, nil
}
// createIndexMapping assembles the bleve index mapping for bookmarks.
//
// It registers a "bookmark" document mapping laid out as follows:
//   - URL:  a default text field mapping
//   - Tags: a text field using the keyword analyzer
//   - Info: a sub-document whose Title and RawText use the English analyzer
//     and whose Size is a numeric field
func createIndexMapping() mapping.IndexMapping {
	// Text analysed as English; one instance is shared by Title and RawText.
	englishText := bleve.NewTextFieldMapping()
	englishText.Analyzer = en.AnalyzerName

	// Text handled by the keyword analyzer, reusable for keyword-like fields.
	keywordText := bleve.NewTextFieldMapping()
	keywordText.Analyzer = keyword.Name

	// Mapping for the page information nested under "Info".
	info := bleve.NewDocumentMapping()
	info.AddFieldMappingsAt("Title", englishText)
	info.AddFieldMappingsAt("Size", bleve.NewNumericFieldMapping())
	info.AddFieldMappingsAt("RawText", englishText)

	// Mapping for the bookmark document itself.
	bookmark := bleve.NewDocumentMapping()
	bookmark.AddFieldMappingsAt("URL", bleve.NewTextFieldMapping())
	bookmark.AddFieldMappingsAt("Tags", keywordText)
	bookmark.AddSubDocumentMapping("Info", info)

	im := bleve.NewIndexMapping()
	im.AddDocumentMapping("bookmark", bookmark)
	return im
}
func (db *DB) Close() {
@@ -73,17 +129,11 @@ func (db *DB) UpdateBookmarkStats() error {
}
// count bookmarks and words indexed
bmI := entity.Bookmark{}
wiI := entity.WordIndex{}
bookmarkCount, err := db.store.TxCount(txn, &bmI, &bolthold.Query{})
if err != nil {
txn.Rollback()
return fmt.Errorf("could not get bookmark count: %s", err)
}
indexWordCount, err := db.store.TxCount(txn, &wiI, &bolthold.Query{})
if err != nil {
txn.Rollback()
return fmt.Errorf("could not get index word count: %s", err)
}
// bucket these stats by day
now := time.Now().Truncate(time.Hour * 24)
@@ -97,7 +147,7 @@ func (db *DB) UpdateBookmarkStats() error {
if stats.History == nil {
stats.History = make(map[time.Time]entity.BookmarkInfo)
}
stats.History[now] = entity.BookmarkInfo{Bookmarks: bookmarkCount, IndexedWords: indexWordCount}
stats.History[now] = entity.BookmarkInfo{Bookmarks: bookmarkCount}
err = db.store.TxUpsert(txn, "stats", &stats)
if err != nil {
txn.Rollback()

View File

@@ -1,80 +1,24 @@
package db
import (
"log"
"github.com/tardisx/linkwallet/entity"
bolthold "github.com/timshannon/bolthold"
)
func (db *DB) InitIndices() {
wi := entity.WordIndex{}
db.store.DeleteMatching(wi, &bolthold.Query{})
panic("unimplemented")
// wi := entity.WordIndex{}
// db.store.DeleteMatching(wi, &bolthold.Query{})
}
func (db *DB) UpdateIndexForWordsByID(words []string, id uint64) {
// delete this id from all indices
txn, err := db.store.Bolt().Begin(true)
if err != nil {
panic(err)
}
db.store.TxForEach(txn, &bolthold.Query{}, func(wi *entity.WordIndex) error {
delete(wi.Bitmap, id)
// if the index is now completely empty, nuke it entirely
empty := true
for _, v := range wi.Bitmap {
if v {
empty = false
break
}
}
// func (db *DB) IndexDocument(id uint64, info entity.PageInfo) {
// log.Printf("I am indexing!")
// err := db.bleve.Index(fmt.Sprint(id), info)
// if err != nil {
// panic(err)
// }
// }
if empty {
db.store.TxDelete(txn, "word_index_"+wi.Word, wi)
} else {
db.store.TxUpdate(txn, "word_index_"+wi.Word, wi)
}
return nil
})
// adding
for i, word := range words {
// log.Printf("indexing %s", word)
thisWI := entity.WordIndex{Word: word}
err := db.store.TxGet(txn, "word_index_"+word, &thisWI)
if err == bolthold.ErrNotFound {
// create it
thisWI.Bitmap = map[uint64]bool{}
} else if err != nil {
panic(err)
}
thisWI.Bitmap[id] = true
err = db.store.TxUpsert(txn, "word_index_"+word, thisWI)
if err != nil {
panic(err)
}
if i > 0 && i%100 == 0 {
txn.Commit()
txn, err = db.store.Bolt().Begin(true)
if err != nil {
panic(err)
}
}
}
txn.Commit()
}
// func (db *DB) UpdateIndexForWordsByID(words []string, id uint64) {
// panic("I should not be called")
// }
func (db *DB) DumpIndex() {
err := db.store.ForEach(&bolthold.Query{}, func(wi *entity.WordIndex) error {
log.Printf("%10s: %v", wi.Word, wi.Bitmap)
return nil
})
if err != nil {
panic(err)
}
panic("unimplemented")
}

View File

@@ -5,6 +5,10 @@ import (
"net/http/httptest"
"os"
"testing"
"time"
"github.com/blevesearch/bleve/v2"
"github.com/blevesearch/bleve/v2/search/query"
"github.com/tardisx/linkwallet/entity"
)
@@ -141,3 +145,113 @@ func TestTagIndexing(t *testing.T) {
t.Error("did not get one id for sloth")
}
}
// testBM builds the fixed bookmark fixture used by the mapping tests: a
// single bookmark with ID 1, two tags and a short page text, with every
// timestamp left at the zero time.
func testBM() entity.Bookmark {
	var unset time.Time // zero value shared by all timestamp fields

	page := entity.PageInfo{
		Fetched:    unset,
		Title:      "one web",
		Size:       200,
		StatusCode: 200,
		RawText:    "one web site is great for all humans",
	}

	bm := entity.Bookmark{
		ID:   1,
		URL:  "https://one.com",
		Info: page,
		Tags: []string{"hello", "big friends"},
	}
	bm.PreserveTitle = false
	bm.TimestampCreated = unset
	bm.TimestampLastScraped = unset
	return bm
}
func TestMappings(t *testing.T) {
mapping := createIndexMapping()
idx, err := bleve.NewMemOnly(mapping)
if err != nil {
t.Error(err)
t.FailNow()
}
bm := testBM()
err = idx.Index("1", bm)
if err != nil {
panic(err)
}
type tc struct {
query query.Query
expHits int
}
tcs := []tc{
{query: bleve.NewMatchQuery("human"), expHits: 1},
{query: bleve.NewMatchQuery("humanoid"), expHits: 0},
{query: bleve.NewMatchQuery("hello"), expHits: 1},
{query: bleve.NewMatchQuery("big"), expHits: 0},
{query: bleve.NewMatchQuery("friends"), expHits: 0},
{query: bleve.NewMatchQuery("big friend"), expHits: 0},
{query: bleve.NewTermQuery("big friends"), expHits: 1},
{query: bleve.NewMatchQuery("web great"), expHits: 1},
}
for i := range tcs {
q := tcs[i].query
sr, err := idx.Search(bleve.NewSearchRequest(q))
if err != nil {
t.Error(err)
} else {
if len(sr.Hits) != tcs[i].expHits {
t.Errorf("wrong hits - expected %d got %d for %s", tcs[i].expHits, len(sr.Hits), tcs[i].query)
}
}
}
}
func TestMappingsDisjunctionQuery(t *testing.T) {
mapping := createIndexMapping()
idx, err := bleve.NewMemOnly(mapping)
if err != nil {
t.Error(err)
t.FailNow()
}
bm := testBM()
err = idx.Index("1", bm)
if err != nil {
panic(err)
}
type tc struct {
query string
expHits int
}
tcs := []tc{
{query: "human", expHits: 1},
{query: "humanoid", expHits: 0},
{query: "hello", expHits: 1},
{query: "big", expHits: 0},
{query: "friends", expHits: 0},
{query: "big friend", expHits: 0},
{query: "big friends", expHits: 1},
{query: "web great", expHits: 1},
}
for i := range tcs {
q := tcs[i].query
req := bleve.NewDisjunctionQuery(
bleve.NewMatchQuery(q),
bleve.NewTermQuery(q),
)
sr, err := idx.Search(bleve.NewSearchRequest(req))
if err != nil {
t.Error(err)
} else {
if len(sr.Hits) != tcs[i].expHits {
t.Errorf("wrong hits - expected %d got %d for %s", tcs[i].expHits, len(sr.Hits), tcs[i].query)
}
}
}
}