Start of blevification

2025-04-25 23:57:04 +09:30
parent 3a5fc1d66d
commit 9b15528510
9 changed files with 289 additions and 316 deletions
@@ -2,14 +2,11 @@ package content

 import (
 	"log"
-	"strings"
 	"time"
-	"unicode"

 	"github.com/tardisx/linkwallet/entity"

 	"github.com/gocolly/colly"
-	snowballeng "github.com/kljensen/snowball/english"
 )

 func FetchPageInfo(bm entity.Bookmark) entity.PageInfo {
@@ -50,50 +47,50 @@ func FetchPageInfo(bm entity.Bookmark) entity.PageInfo {
 	return info
 }

-func Words(bm *entity.Bookmark) []string {
-	words := []string{}
+// func Words(bm *entity.Bookmark) []string {
+// 	words := []string{}

-	words = append(words, StringToStemmedSearchWords(bm.Info.RawText)...)
-	words = append(words, StringToStemmedSearchWords(bm.Info.Title)...)
-	words = append(words, StringToStemmedSearchWords(bm.URL)...)
-	return words
-}
+// 	words = append(words, StringToStemmedSearchWords(bm.Info.RawText)...)
+// 	words = append(words, StringToStemmedSearchWords(bm.Info.Title)...)
+// 	words = append(words, StringToStemmedSearchWords(bm.URL)...)
+// 	return words
+// }

-// StringToStemmedSearchWords returns a list of stemmed words with stop words
-// removed.
-func StringToStemmedSearchWords(s string) []string {
-	words := []string{}
+// // StringToStemmedSearchWords returns a list of stemmed words with stop words
+// // removed.
+// func StringToStemmedSearchWords(s string) []string {
+// 	words := []string{}

-	words = append(words, stemmerFilter(stopwordFilter(tokenize(s)))...)
-	return words
-}
+// 	words = append(words, stemmerFilter(stopwordFilter(tokenize(s)))...)
+// 	return words
+// }

-func tokenize(text string) []string {
-	return strings.FieldsFunc(text, func(r rune) bool {
-		// Split on any character that is not a letter or a number.
-		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
-	})
-}
+// func tokenize(text string) []string {
+// 	return strings.FieldsFunc(text, func(r rune) bool {
+// 		// Split on any character that is not a letter or a number.
+// 		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
+// 	})
+// }

-func stemmerFilter(tokens []string) []string {
-	r := make([]string, len(tokens))
-	for i, token := range tokens {
-		r[i] = snowballeng.Stem(token, false)
-	}
-	return r
-}
+// func stemmerFilter(tokens []string) []string {
+// 	r := make([]string, len(tokens))
+// 	for i, token := range tokens {
+// 		r[i] = snowballeng.Stem(token, false)
+// 	}
+// 	return r
+// }

-var stopwords = map[string]struct{}{ // I wish Go had built-in sets.
-	"a": {}, "and": {}, "be": {}, "have": {}, "i": {},
-	"in": {}, "of": {}, "that": {}, "the": {}, "to": {},
-}
+// var stopwords = map[string]struct{}{ // I wish Go had built-in sets.
+// 	"a": {}, "and": {}, "be": {}, "have": {}, "i": {},
+// 	"in": {}, "of": {}, "that": {}, "the": {}, "to": {},
+// }

-func stopwordFilter(tokens []string) []string {
-	r := make([]string, 0, len(tokens))
-	for _, token := range tokens {
-		if _, ok := stopwords[token]; !ok {
-			r = append(r, token)
-		}
-	}
-	return r
-}
+// func stopwordFilter(tokens []string) []string {
+// 	r := make([]string, 0, len(tokens))
+// 	for _, token := range tokens {
+// 		if _, ok := stopwords[token]; !ok {
+// 			r = append(r, token)
+// 		}
+// 	}
+// 	return r
+// }
@@ -47,45 +47,45 @@ func TestSimpleScrape(t *testing.T) {
 	}
 }

-func TestWords(t *testing.T) {
+// func TestWords(t *testing.T) {

-	bm := entity.Bookmark{
-		//		ID:                   0,
-		//		URL:                  "",
-		Info: entity.PageInfo{RawText: "the quick brown fox jumped over the lazy dog"},
-		//		Tags:                 []string{},
-	}
-	words := Words(&bm)
-	if len(words) != 7 {
-		t.Errorf("got %d words not 7", len(words))
-	} else {
-		if words[0] != "quick" ||
-			words[1] != "brown" ||
-			words[2] != "fox" ||
-			words[3] != "jump" ||
-			words[4] != "over" ||
-			words[5] != "lazi" ||
-			words[6] != "dog" {
-			t.Error("incorrect words returned")
-		}
-	}
-}
+// 	bm := entity.Bookmark{
+// 		//		ID:                   0,
+// 		//		URL:                  "",
+// 		Info: entity.PageInfo{RawText: "the quick brown fox jumped over the lazy dog"},
+// 		//		Tags:                 []string{},
+// 	}
+// 	words := Words(&bm)
+// 	if len(words) != 7 {
+// 		t.Errorf("got %d words not 7", len(words))
+// 	} else {
+// 		if words[0] != "quick" ||
+// 			words[1] != "brown" ||
+// 			words[2] != "fox" ||
+// 			words[3] != "jump" ||
+// 			words[4] != "over" ||
+// 			words[5] != "lazi" ||
+// 			words[6] != "dog" {
+// 			t.Error("incorrect words returned")
+// 		}
+// 	}
+// }

-func TestStemmer(t *testing.T) {
-	s := `quick quick fox 😂 smile http://google.com`
-	words1 := StringToStemmedSearchWords(s)
-	t.Log(words1)
-	if len(words1) != 7 {
-		t.Error("wrong number of words")
-	}
-	if words1[0] != "quick" ||
-		words1[1] != "quick" ||
-		words1[2] != "fox" ||
-		words1[3] != "smile" ||
-		words1[4] != "http" ||
-		words1[5] != "googl" ||
-		words1[6] != "com" {
-		t.Error("bad words")
-	}
+// func TestStemmer(t *testing.T) {
+// 	s := `quick quick fox 😂 smile http://google.com`
+// 	words1 := StringToStemmedSearchWords(s)
+// 	t.Log(words1)
+// 	if len(words1) != 7 {
+// 		t.Error("wrong number of words")
+// 	}
+// 	if words1[0] != "quick" ||
+// 		words1[1] != "quick" ||
+// 		words1[2] != "fox" ||
+// 		words1[3] != "smile" ||
+// 		words1[4] != "http" ||
+// 		words1[5] != "googl" ||
+// 		words1[6] != "com" {
+// 		t.Error("bad words")
+// 	}

-}
+// }