├── contrib └── Dockerfile ├── http.go ├── main.go ├── parser.go ├── readme.md └── repo.go /contrib/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | RUN apt-get -y update && \ 4 | apt-get install -y curl wget git && \ 5 | curl https://raw.githubusercontent.com/Restream/reindexer/master/dependencies.sh | bash -s 6 | 7 | RUN wget https://redirector.gvt1.com/edgedl/go/go1.9.3.linux-amd64.tar.gz -nv -O - | tar xzv -C /usr/local 8 | ENV GOROOT=/usr/local/go 9 | ENV GOPATH=/go 10 | ENV PATH=$PATH:$GOROOT/bin:$GOPATH/bin 11 | 12 | RUN go get github.com/buaazp/fasthttprouter && \ 13 | go get github.com/PuerkitoBio/goquery && \ 14 | go get github.com/nfnt/resize 15 | 16 | RUN apt-get -y install libgoogle-perftools-dev 17 | -------------------------------------------------------------------------------- /http.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // Import package 4 | import ( 5 | "encoding/json" 6 | "fmt" 7 | "log" 8 | "os" 9 | "path" 10 | "strconv" 11 | "time" 12 | 13 | "github.com/buaazp/fasthttprouter" 14 | "github.com/valyala/fasthttp" 15 | ) 16 | 17 | type ErrorResponce struct { 18 | Success bool `json:"success"` 19 | Error string `json:"error"` 20 | } 21 | 22 | type HabrPostView struct { 23 | *HabrPost 24 | Link string `json:"link"` 25 | Image string `json:"image"` 26 | } 27 | 28 | type PostsResponce struct { 29 | Items []HabrPostView `json:"items"` 30 | TotalCount int `json:"total_count,omitempty"` 31 | ElapsedMs int64 `json:"elapsed_ms,omitempty"` 32 | Success bool `json:"success"` 33 | } 34 | 35 | type HabrCommentView struct { 36 | *HabrComment 37 | Link string `json:"link"` 38 | } 39 | 40 | type CommentsResponce struct { 41 | Items []HabrCommentView `json:"items"` 42 | TotalCount int `json:"total_count,omitempty"` 43 | ElapsedMs int64 `json:"elapsed_ms,omitempty"` 44 | Success bool `json:"success"` 45 | } 46 | 47 | func respError(ctx *fasthttp.RequestCtx, httpCode int, err error) { 48 | resp := ErrorResponce{ 49 | Success: false, 50 | Error: err.Error(), 51 | } 52 | ctx.SetStatusCode(httpCode) 53 | ret, _ := json.Marshal(resp) 54 | ctx.Write(ret) 55 | } 56 | 57 | func respJSON(ctx *fasthttp.RequestCtx, data interface{}) { 58 | ctx.SetStatusCode(200) 59 | ctx.SetContentType("application/json; charset=utf-8") 60 | ctx.Response.Header.Add("Connection", "keep-alive") 61 | ret, _ := json.Marshal(data) 62 | ctx.Write(ret) 63 | } 64 | 65 | func convertComments(in []*HabrComment) (out []HabrCommentView) { 66 | out = make([]HabrCommentView, 0, len(in)) 67 | for _, comment := range in { 68 | cv := HabrCommentView{ 69 | HabrComment: comment, 70 | Link: fmt.Sprintf("https://habrahabr.ru/post/%d/#comment_%d", comment.PostID, comment.ID), 71 | } 72 | out = append(out, cv) 73 | } 74 | return out 75 | } 76 | 77 | func convertPosts(in []*HabrPost) (out []HabrPostView) { 78 | out = make([]HabrPostView, 0, len(in)) 79 | for _, post := range in { 80 | pv := HabrPostView{ 81 | HabrPost: post, 82 | Link: fmt.Sprintf("https://habrahabr.ru/post/%d/", post.ID), 83 | } 84 | if post.HasImage { 85 | pv.Image = fmt.Sprintf("/images/%d.jpeg", post.ID) 86 | } 87 | 88 | out = append(out, pv) 89 | } 90 | return out 91 | } 92 | 93 | func SearchPosts(ctx *fasthttp.RequestCtx) { 94 | text := string(ctx.QueryArgs().Peek("query")) 95 | limit, _ := ctx.QueryArgs().GetUint("limit") 96 | offset, _ := ctx.QueryArgs().GetUint("offset") 97 | sortBy := 
string(ctx.QueryArgs().Peek("sort_by")) 98 | sortDesc, _ := ctx.QueryArgs().GetUint("sort_desc") 99 | 100 | t := time.Now() 101 | items, total, err := repo.SearchPosts(text, offset, limit, sortBy, sortDesc > 0) 102 | 103 | if err != nil { 104 | respError(ctx, 502, err) 105 | return 106 | } 107 | 108 | resp := PostsResponce{ 109 | Items: convertPosts(items), 110 | TotalCount: total, 111 | ElapsedMs: int64(time.Now().Sub(t) / time.Millisecond), 112 | Success: true, 113 | } 114 | 115 | respJSON(ctx, resp) 116 | } 117 | 118 | func GetPostsHandler(ctx *fasthttp.RequestCtx) { 119 | user := string(ctx.QueryArgs().Peek("user")) 120 | limit, _ := ctx.QueryArgs().GetUint("limit") 121 | offset, _ := ctx.QueryArgs().GetUint("offset") 122 | startTime, _ := ctx.QueryArgs().GetUint("start_time") 123 | endTime, _ := ctx.QueryArgs().GetUint("end_time") 124 | withComments, _ := ctx.QueryArgs().GetUint("with_comments") 125 | 126 | t := time.Now() 127 | items, total, err := repo.GetPosts(offset, limit, user, startTime, endTime, withComments > 0) 128 | 129 | if err != nil { 130 | respError(ctx, 502, err) 131 | return 132 | } 133 | resp := PostsResponce{ 134 | Items: convertPosts(items), 135 | TotalCount: total, 136 | ElapsedMs: int64(time.Now().Sub(t) / time.Millisecond), 137 | Success: true, 138 | } 139 | 140 | respJSON(ctx, resp) 141 | } 142 | 143 | func SearchComments(ctx *fasthttp.RequestCtx) { 144 | text := string(ctx.QueryArgs().Peek("query")) 145 | limit, _ := ctx.QueryArgs().GetUint("limit") 146 | offset, _ := ctx.QueryArgs().GetUint("offset") 147 | sortBy := string(ctx.QueryArgs().Peek("sort_by")) 148 | sortDesc, _ := ctx.QueryArgs().GetUint("sort_desc") 149 | 150 | t := time.Now() 151 | items, total, err := repo.SearchComments(text, offset, limit, sortBy, sortDesc > 0) 152 | 153 | if err != nil { 154 | respError(ctx, 502, err) 155 | return 156 | } 157 | 158 | resp := CommentsResponce{ 159 | Items: convertComments(items), 160 | TotalCount: total, 161 | ElapsedMs: int64(time.Now().Sub(t) / time.Millisecond), 162 | Success: true, 163 | } 164 | 165 | respJSON(ctx, resp) 166 | } 167 | 168 | func SearchHandler(ctx *fasthttp.RequestCtx) { 169 | sortBy := string(ctx.QueryArgs().Peek("search_type")) 170 | switch sortBy { 171 | case "posts", "": 172 | SearchPosts(ctx) 173 | case "comments": 174 | SearchComments(ctx) 175 | default: 176 | respError(ctx, 401, fmt.Errorf("Invalid search_type. 
Valid values are: 'comments' or 'posts'")) 177 | } 178 | 179 | } 180 | 181 | func GetPostHandler(ctx *fasthttp.RequestCtx) { 182 | id, _ := strconv.Atoi(ctx.UserValue("id").(string)) 183 | withComments, _ := ctx.QueryArgs().GetUint("with_comments") 184 | 185 | item, err := repo.GetPost(id, withComments > 0) 186 | 187 | if err != nil { 188 | respError(ctx, 502, err) 189 | return 190 | } 191 | 192 | respJSON(ctx, item) 193 | } 194 | 195 | func ConfigureHandler(ctx *fasthttp.RequestCtx) { 196 | ns := ctx.UserValue("ns").(string) 197 | var newCfg FTConfig 198 | err := json.Unmarshal(ctx.PostBody(), &newCfg) 199 | if err != nil { 200 | respError(ctx, 502, err) 201 | return 202 | } 203 | err = repo.SetFTConfig(ns, newCfg) 204 | if err != nil { 205 | respError(ctx, 502, err) 206 | return 207 | } 208 | ctx.WriteString("ok") 209 | return 210 | } 211 | 212 | func GetDocHandler(ctx *fasthttp.RequestCtx) { 213 | urlPath := string(ctx.Path()) 214 | 215 | target := path.Join(*webRootPath, urlPath) 216 | 217 | f, err := os.Stat(target) 218 | if err != nil || f.IsDir() { 219 | target = path.Join(*webRootPath, "index.html") 220 | } 221 | 222 | log.Printf("%s", target) 223 | 224 | ctx.SendFile(target) 225 | 226 | } 227 | 228 | func HandlerWrapper(handler func(ctx *fasthttp.RequestCtx)) func(ctx *fasthttp.RequestCtx) { 229 | return func(ctx *fasthttp.RequestCtx) { 230 | 231 | ip := ctx.RemoteIP().String() 232 | if ip != "127.0.0.1" && ip != "188.120.235.218" { 233 | ctx.SetStatusCode(401) 234 | ctx.WriteString("bad IP - gone") 235 | return 236 | } 237 | 238 | t := time.Now() 239 | handler(ctx) 240 | latency := time.Now().Sub(t) 241 | 242 | log.Printf( 243 | "%s %s %s %d %d %v %s", 244 | ip, 245 | string(ctx.Method()), 246 | string(ctx.RequestURI()), 247 | ctx.Response.StatusCode(), 248 | len(ctx.Response.Body()), 249 | latency, 250 | string(ctx.UserAgent()), 251 | ) 252 | } 253 | } 254 | 255 | func StartHTTP(addr string) { 256 | router := fasthttprouter.New() 257 | router.GET("/api/search", SearchHandler) 258 | router.GET("/api/posts/:id", GetPostHandler) 259 | router.GET("/api/posts", GetPostsHandler) 260 | // router.POST("/api/configure/:ns", ConfigureHandler) 261 | router.GET("/images/*filepath", GetDocHandler) 262 | router.GET("/static/*filepath", GetDocHandler) 263 | router.GET("/index.html", GetDocHandler) 264 | router.GET("/search", GetDocHandler) 265 | router.GET("/", GetDocHandler) 266 | log.Printf("Starting listen fasthttp on %s", addr) 267 | if err := fasthttp.ListenAndServe(addr, HandlerWrapper(router.Handler)); err != nil { 268 | panic(err) 269 | } 270 | } 271 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "flag" 6 | "fmt" 7 | "io/ioutil" 8 | "log" 9 | "os" 10 | "path/filepath" 11 | "sync" 12 | "time" 13 | ) 14 | 15 | var repo Repo 16 | 17 | var numParallelImports = 4 18 | 19 | var httpAddr = flag.String("httpaddr", ":8881", "HTTP listen address:port") 20 | var importStartID = flag.Int("startid", 353800, "Import post start ID") 21 | var importFinishID = flag.Int("finishid", 355000, "Import post finish ID") 22 | var dumpPostsPath = flag.String("dumppath", "/Users/ogerasimov/habrimport", "Path, where imported posts are stored") 23 | var webRootPath = flag.String("webrootpath", "/Users/ogerasimov/habrdemo-static", "Path, where HTML static data is hosted") 24 | var syncTimeout = flag.Int("synctimeout", 30, "Sync 
timeout in minutes") 25 | 26 | func dload(wg *sync.WaitGroup, dlChannel chan int) { 27 | for i := range dlChannel { 28 | habrPost, imgData, err := DownloadPost(i) 29 | if habrPost != nil && err == nil { 30 | fmt.Printf("ID %d (at %s) - %s, %d comments, %d views, %d likes, %d bookmarks\n", 31 | i, time.Unix(habrPost.Time, 0).Format("02.01.06"), habrPost.Title, len(habrPost.Comments), habrPost.Views, habrPost.Likes, habrPost.Favorites) 32 | data, _ := json.Marshal(habrPost) 33 | ioutil.WriteFile(fmt.Sprintf("%s/%d.json", *dumpPostsPath, i), data, 0666) 34 | 35 | if imgData != nil { 36 | ioutil.WriteFile(fmt.Sprintf("%s/%d.jpeg", filepath.Join(*webRootPath, "images"), i), imgData, 0666) 37 | 38 | } 39 | } else { 40 | // fmt.Printf("ID %d - error %s\n", i, err.Error()) 41 | } 42 | } 43 | wg.Done() 44 | } 45 | 46 | func downloadFiles() { 47 | dlChannel := make(chan int) 48 | wg := sync.WaitGroup{} 49 | os.Mkdir(*dumpPostsPath, os.ModePerm) 50 | os.Mkdir(filepath.Join(*webRootPath, "images"), os.ModePerm) 51 | 52 | for i := 0; i < numParallelImports; i++ { 53 | wg.Add(1) 54 | go dload(&wg, dlChannel) 55 | } 56 | for i := *importStartID; i < *importFinishID; i++ { 57 | dlChannel <- i 58 | } 59 | 60 | close(dlChannel) 61 | wg.Wait() 62 | } 63 | 64 | func syncDataRoutine() { 65 | for { 66 | time.Sleep(time.Duration(*syncTimeout) * time.Minute) 67 | log.Printf("Syncing...") 68 | log.Printf("Downloading posts from ID %d to %d", *importStartID, *importFinishID) 69 | downloadFiles() 70 | log.Printf("Updating posts from ID %d to %d", *importStartID, *importFinishID) 71 | repo.RestoreRangeFromFiles(*dumpPostsPath, *importStartID, *importFinishID) 72 | repo.Done() 73 | repo.Init() 74 | } 75 | } 76 | 77 | func usage() { 78 | fmt.Printf( 79 | "usage: %s []\n"+ 80 | "The available commands are:\n"+ 81 | " run Run HTTP API server\n"+ 82 | " import Import posts from habrhabr site\n"+ 83 | " load Load imported data to reindexer\n", 84 | os.Args[0], 85 | ) 86 | os.Exit(-1) 87 | 88 | } 89 | 90 | func main() { 91 | if len(os.Args) < 2 { 92 | usage() 93 | } 94 | 95 | flag.CommandLine.Parse(os.Args[2:]) 96 | 97 | switch os.Args[1] { 98 | case "run": 99 | repo.Init() 100 | repo.WarmUp() 101 | go syncDataRoutine() 102 | StartHTTP(*httpAddr) 103 | case "import": 104 | downloadFiles() 105 | case "load": 106 | os.RemoveAll("/var/lib/reindexer/habr") 107 | repo.Init() 108 | repo.RestoreAllFromFiles(*dumpPostsPath) 109 | repo.Done() 110 | default: 111 | usage() 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /parser.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "image" 7 | "image/gif" 8 | "image/jpeg" 9 | "image/png" 10 | "net/http" 11 | "strconv" 12 | "strings" 13 | "time" 14 | 15 | "github.com/PuerkitoBio/goquery" 16 | "github.com/nfnt/resize" 17 | ) 18 | 19 | var months = map[string]int{"января": 1, "февраля": 2, "марта": 3, "апреля": 4, "мая": 5, "июня": 6, "июля": 7, "августа": 8, "сентября": 9, "октября": 10, "ноября": 11, "декабря": 12} 20 | 21 | func parseTime(htime string) (t time.Time, err error) { 22 | t = time.Now() 23 | timeParts := strings.Split(strings.Trim(htime, " "), " ") 24 | strDateTime := "" 25 | timeIdx := 2 26 | 27 | if len(timeParts[0]) == 8 && timeParts[0][2] == '.' 
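/* The branches below normalize habrahabr's three date formats before RFC3339 parsing: an absolute "dd.mm.yy HH:MM" date (this branch), the words "сегодня" (today) / "вчера" (yesterday), and "day monthname [year]" */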
{ 28 | strDateTime = fmt.Sprintf("20%s-%s-%s", timeParts[0][6:8], timeParts[0][3:5], timeParts[0][0:2]) 29 | } else if timeParts[0] == "сегодня" { 30 | strDateTime = fmt.Sprintf("%04d-%02d-%02d", t.Year(), t.Month(), t.Day()) 31 | } else if timeParts[0] == "вчера" { 32 | t = t.Add(-time.Hour * 24) 33 | strDateTime = fmt.Sprintf("%04d-%02d-%02d", t.Year(), t.Month(), t.Day()) 34 | } else if len(timeParts) > 2 { 35 | timeIdx++ 36 | month, ok := months[timeParts[1]] 37 | if !ok { 38 | month = 1 39 | } 40 | year, err := strconv.Atoi(timeParts[2]) 41 | if err == nil { 42 | timeIdx++ 43 | } else { 44 | year = t.Year() 45 | } 46 | day, err := strconv.Atoi(timeParts[0]) 47 | if err != nil { 48 | day = 1 49 | } 50 | strDateTime = fmt.Sprintf("%04d-%02d-%02d", year, month, day) 51 | } 52 | 53 | if timeIdx < len(timeParts) { 54 | strDateTime += "T" + timeParts[timeIdx] + ":00+03:00" 55 | 56 | t, err = time.Parse(time.RFC3339, strDateTime) 57 | } else { 58 | err = fmt.Errorf("Can't parse time %s", htime) 59 | } 60 | 61 | return t, err 62 | 63 | } 64 | 65 | func downloadAndResizeImage(url string) (out []byte, err error) { 66 | resp, err := http.Get(url) 67 | 68 | if err != nil { 69 | return nil, err 70 | } 71 | defer resp.Body.Close() 72 | 73 | if resp.StatusCode != 200 { 74 | return nil, fmt.Errorf("%s - Got %d status", url, resp.StatusCode) 75 | } 76 | 77 | ctype := resp.Header.Get("content-type") 78 | 79 | var img image.Image 80 | 81 | switch ctype { 82 | case "image/png": 83 | img, err = png.Decode(resp.Body) 84 | case "image/jpeg", "image/jpg": 85 | img, err = jpeg.Decode(resp.Body) 86 | case "image/gif": 87 | img, err = gif.Decode(resp.Body) 88 | default: 89 | return nil, fmt.Errorf("%s - Unknown image type %s", url, ctype) 90 | } 91 | 92 | if err != nil { 93 | return nil, err 94 | } 95 | 96 | // fmt.Printf("%s -> %s (%d,%d)\n", url, ctype, img.Bounds().Size().X, img.Bounds().Size().Y) 97 | 98 | img = resize.Thumbnail(100, 100, img, resize.Lanczos3) 99 | 100 | var buf bytes.Buffer 101 | jpeg.Encode(&buf, img, nil) 102 | 103 | return buf.Bytes(), nil 104 | } 105 | 106 | func DownloadPost(ID int) (*HabrPost, []byte, error) { 107 | 108 | url := fmt.Sprintf("https://habrahabr.ru/post/%d/", ID) 109 | 110 | doc, err := goquery.NewDocument(url) 111 | 112 | if err != nil { 113 | return nil, nil, err 114 | } 115 | 116 | var dpost, dcomments, dstats *goquery.Selection 117 | 118 | doc.Find("div").Each(func(i int, s *goquery.Selection) { 119 | if className, ok := s.Attr("class"); ok { 120 | if strings.Index(className, "post__wrapper") >= 0 { 121 | dpost = s 122 | } 123 | if strings.Index(className, "comments-section") >= 0 { 124 | dcomments = s 125 | } 126 | if strings.Index(className, "post-additionals") >= 0 { 127 | dstats = s 128 | } 129 | } 130 | }) 131 | 132 | if dpost == nil { 133 | return nil, nil, fmt.Errorf("Data not found") 134 | } 135 | 136 | habrPost := &HabrPost{} 137 | var imgData []byte 138 | dpost.Find("div").Each(func(i int, s *goquery.Selection) { 139 | if className, ok := s.Attr("class"); ok { 140 | if strings.Index(className, "post__text") >= 0 { 141 | habrPost.Text = s.Text() 142 | img := s.Find("img").First() 143 | if img != nil { 144 | if srcURL, ok := img.Attr("src"); ok { 145 | imgData, err = downloadAndResizeImage(srcURL) 146 | if imgData != nil && err == nil { 147 | habrPost.HasImage = true 148 | } 149 | } 150 | } 151 | } 152 | } 153 | }) 154 | 155 | dpost.Find("a").Each(func(i int, s *goquery.Selection) { 156 | if className, ok := s.Attr("class"); ok { 157 | if 
strings.Index(className, "inline-list__item-link hub-link") >= 0 { 158 | habrPost.Hubs = append(habrPost.Hubs, s.Text()) 159 | } 160 | if strings.Index(className, "inline-list__item-link post__tag") >= 0 { 161 | habrPost.Tags = append(habrPost.Tags, s.Text()) 162 | } 163 | } 164 | }) 165 | 166 | dpost.Find("span").Each(func(i int, s *goquery.Selection) { 167 | if className, ok := s.Attr("class"); ok { 168 | if strings.Index(className, "post__title-text") >= 0 { 169 | habrPost.Title = s.Text() 170 | } 171 | if strings.Index(className, "post__time") >= 0 { 172 | t, err := parseTime(s.Text()) 173 | if err != nil { 174 | fmt.Printf("Error parsing time %s", err.Error()) 175 | } 176 | habrPost.Time = t.Unix() 177 | } 178 | if strings.Index(className, "user-info__nickname") >= 0 { 179 | habrPost.User = s.Text() 180 | } 181 | } 182 | }) 183 | 184 | if dcomments != nil { 185 | dcomments.Find("div").Each(func(i int, s *goquery.Selection) { 186 | if className, ok := s.Attr("class"); ok { 187 | if strings.Index(className, "comment") >= 0 { 188 | comment := &HabrComment{} 189 | comment.ID = ID*1000 + len(habrPost.Comments) 190 | 191 | if commentIDStr, ok := s.Attr("id"); ok { 192 | commentIDStr = strings.TrimPrefix(commentIDStr, "comment_") 193 | if commentID, err := strconv.Atoi(commentIDStr); err == nil { 194 | comment.ID = commentID 195 | } 196 | } 197 | 198 | s.Find("div").Each(func(i int, s *goquery.Selection) { 199 | if className, ok := s.Attr("class"); ok { 200 | if strings.Index(className, "comment__message") >= 0 { 201 | comment.Text = s.Text() 202 | } 203 | } 204 | }) 205 | s.Find("span").Each(func(i int, s *goquery.Selection) { 206 | if className, ok := s.Attr("class"); ok { 207 | if strings.Index(className, "user-info__nickname") >= 0 { 208 | comment.User = s.Text() 209 | } 210 | if strings.Index(className, "voting-wjt__counter") >= 0 { 211 | comment.Likes, _ = strconv.Atoi(s.Text()) 212 | } 213 | } 214 | }) 215 | s.Find("time").Each(func(i int, s *goquery.Selection) { 216 | if className, ok := s.Attr("class"); ok { 217 | if strings.Index(className, "comment__date-time") >= 0 { 218 | t, err := parseTime(s.Text()) 219 | if err != nil { 220 | fmt.Printf("Error parsing time %s", err.Error()) 221 | } 222 | comment.Time = t.Unix() 223 | } 224 | } 225 | }) 226 | 227 | if len(comment.Text) > 0 { 228 | comment.PostID = ID 229 | habrPost.Comments = append(habrPost.Comments, comment) 230 | } 231 | } 232 | } 233 | }) 234 | } 235 | if dstats != nil { 236 | dstats.Find("span").Each(func(i int, s *goquery.Selection) { 237 | if className, ok := s.Attr("class"); ok { 238 | if strings.Index(className, "voting-wjt__counter") >= 0 { 239 | habrPost.Likes, _ = strconv.Atoi(s.Text()) 240 | } 241 | if strings.Index(className, "bookmark__counter") >= 0 { 242 | habrPost.Favorites, _ = strconv.Atoi(s.Text()) 243 | } 244 | if strings.Index(className, "post-stats__views-count") >= 0 { 245 | viewsStr := strings.Replace(s.Text(), ",", ".", -1) 246 | mult := 1.0 247 | if kMultIdx := strings.Index(viewsStr, "k"); kMultIdx >= 0 { 248 | mult = 1000.0 249 | viewsStr = viewsStr[:kMultIdx] 250 | } 251 | 252 | views, _ := strconv.ParseFloat(viewsStr, 64) 253 | habrPost.Views = int(views * mult) 254 | } 255 | } 256 | }) 257 | 258 | } 259 | 260 | habrPost.ID = ID 261 | 262 | return habrPost, imgData, nil 263 | } 264 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Habrahabr livesearch demo 
application 2 | 3 | This is a small proof-of-concept application that implements live full-text search over the popular Russian site habrahabr.ru. 4 | 5 | The [Reindexer](https://github.com/Restream/reindexer) in-memory DB is used as the storage and full-text search engine. 6 | 7 | The current dataset contains about 5GB of data: 170K articles with 6M comments. 8 | 9 | The frontend for the project is written with vue.js and lives in a separate [repository](https://github.com/igtulm/reindex-search-ui). 10 | 11 | ![](https://habrastorage.org/webt/ob/eo/lq/obeolqk0_j5nu0junamkmqwdltq.gif) 12 | 13 | 14 | ## Build & install 15 | 16 | 1. Install & build [Reindexer's dependencies](https://github.com/Restream/reindexer/blob/master/readme.md#installation) 17 | 2. Install habr-search 18 | ```bash 19 | go get github.com/olegator77/habr-search 20 | ``` 21 | 22 | ## Usage 23 | 24 | 1. Import (download) the dataset from habrahabr.ru 25 | ``` 26 | habr-search import -startid 1 -finishid 355000 -dumppath -webrootpath 27 | ``` 28 | 29 | This step is very long: downloading all the data can take 8+ hours and requires about 5GB of free disk space. You can reduce the time 30 | and size by decreasing the ID range, e.g. by setting startid to 350000. 31 | 32 | 2. Load the imported data into Reindexer 33 | 34 | ``` 35 | habr-search load -dumppath -webrootpath 36 | ``` 37 | 38 | This step takes about 5 minutes for the full dataset. 39 | 40 | 3. Install and build the frontend 41 | 42 | - Follow the [instructions](https://github.com/igtulm/reindex-search-ui) 43 | - Copy the built frontend to the `webrootpath` folder 44 | 45 | 4. Run the service 46 | 47 | ``` 48 | habr-search run -dumppath -webrootpath 49 | ``` 50 | 51 | Open http://127.0.0.1:8881 in your browser. 52 | 53 | -------------------------------------------------------------------------------- /repo.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // Import package 4 | import ( 5 | "bytes" 6 | "encoding/json" 7 | "fmt" 8 | "log" 9 | "os" 10 | "strings" 11 | "unicode" 12 | 13 | "io/ioutil" 14 | 15 | "github.com/restream/reindexer" 16 | _ "github.com/restream/reindexer/bindings/builtin" 17 | _ "github.com/restream/reindexer/pprof" 18 | ) 19 | 20 | type HabrComment struct { 21 | ID int `reindex:"id,,pk" json:"id"` 22 | PostID int `reindex:"post_id,,dense" json:"post_id"` 23 | Text string `reindex:"text,-,dense" json:"text"` 24 | User string `reindex:"user,-,dense" json:"user"` 25 | Time int64 `reindex:"time,-,dense" json:"time"` 26 | Likes int `reindex:"likes,-,dense" json:"likes,omitempty"` 27 | _ struct{} `reindex:"text+user=search,text,composite"` 28 | } 29 | 30 | type HabrPost struct { 31 | ID int `reindex:"id,tree,pk" json:"id"` 32 | Time int64 `reindex:"time,tree,dense" json:"time"` 33 | Text string `reindex:"text,-" json:"text"` 34 | Title string `reindex:"title,-" json:"title"` 35 | User string `reindex:"user" json:"user"` 36 | Hubs []string `reindex:"hubs" json:"hubs"` 37 | Tags []string `reindex:"tags" json:"tags"` 38 | Likes int `reindex:"likes,-,dense" json:"likes,omitempty"` 39 | Favorites int `reindex:"favorites,-,dense" json:"favorites,omitempty"` 40 | Views int `reindex:"views,-,dense" json:"views"` 41 | HasImage bool `json:"has_image,omitempty"` 42 | 43 | Comments []*HabrComment `reindex:"comments,,joined" json:"comments,omitempty"` 44 | _ struct{} `reindex:"title+text+user=search,text,composite"` 45 | } 46 | 47 | type FTConfig struct { 48 | Bm25Boost float64 `json:"bm25_boost"` 49 | Bm25Weight float64 `json:"bm25_weight"` 50 | DistanceBoost float64 
`json:"distance_boost"` 51 | DistanceWeight float64 `json:"distance_weight"` 52 | TermLenBoost float64 `json:"term_len_boost"` 53 | TermLenWeight float64 `json:"term_len_weight"` 54 | MinRelevancy float64 `json:"min_relevancy"` 55 | Fields string `json:"fields"` 56 | } 57 | 58 | type RepoConfig struct { 59 | PostsFt FTConfig `json:"posts"` 60 | CommentsFt FTConfig `json:"comments"` 61 | } 62 | 63 | type Repo struct { 64 | db *reindexer.Reindexer 65 | cfg RepoConfig 66 | ready bool 67 | } 68 | 69 | func applyOffsetAndLimit(query *reindexer.Query, offset, limit int) { 70 | if limit != -1 { 71 | query.Limit(limit) 72 | } else { 73 | query.Limit(20) 74 | } 75 | 76 | if offset != -1 { 77 | query.Offset(offset) 78 | } 79 | } 80 | 81 | func textToReindexFullTextDSL(fields string, input string) string { 82 | var output, cur bytes.Buffer 83 | // Boost fields 84 | if len(fields) > 0 { 85 | output.WriteByte('@') 86 | output.WriteString(fields) 87 | output.WriteByte(' ') 88 | } 89 | 90 | interm := false 91 | term := 0 92 | termLen := 0 93 | 94 | // trim input spaces, and add trailing space 95 | input = strings.Trim(input, " ") + " " 96 | 97 | for _, r := range input { 98 | if (unicode.IsDigit(r) || unicode.IsLetter(r)) && !interm { 99 | cur.Reset() 100 | interm = true 101 | termLen = 0 102 | } 103 | 104 | if !unicode.IsDigit(r) && !unicode.IsLetter(r) && !strings.Contains("-+/", string(r)) && interm { 105 | 106 | if term > 0 { 107 | output.WriteByte('+') 108 | } 109 | switch { 110 | case termLen >= 3: 111 | // enable typos search from 3 symbols in term 112 | output.WriteString("*") 113 | output.Write(cur.Bytes()) 114 | output.WriteString("~*") 115 | case termLen >= 2: 116 | // enable prefix from 2 symbol or on 2-nd+ term 117 | output.Write(cur.Bytes()) 118 | output.WriteString("~*") 119 | default: 120 | output.Write(cur.Bytes()) 121 | } 122 | output.WriteByte(' ') 123 | interm = false 124 | term++ 125 | if term > 8 { 126 | break 127 | } 128 | } 129 | if interm { 130 | cur.WriteRune(r) 131 | termLen++ 132 | } 133 | } 134 | 135 | if termLen <= 2 && term == 1 { 136 | return "" 137 | } 138 | 139 | return output.String() 140 | } 141 | 142 | func (r *Repo) SearchPosts(text string, offset, limit int, sortBy string, sortDesc bool) ([]*HabrPost, int, error) { 143 | 144 | if !r.ready { 145 | return nil, 0, fmt.Errorf("repo is not ready") 146 | } 147 | 148 | query := repo.db.Query("posts"). 149 | Match("search", textToReindexFullTextDSL(r.cfg.PostsFt.Fields, text)). 150 | ReqTotal() 151 | 152 | query.Functions("text = snippet(,,30,30, ...,...
)") 153 | 154 | if len(sortBy) != 0 { 155 | query.Sort(sortBy, sortDesc) 156 | } 157 | 158 | applyOffsetAndLimit(query, offset, limit) 159 | 160 | it := query.Exec() 161 | defer it.Close() 162 | 163 | if err := it.Error(); err != nil { 164 | return nil, 0, err 165 | } 166 | 167 | items := make([]*HabrPost, 0, it.Count()) 168 | for it.Next() { 169 | item := it.Object() 170 | items = append(items, item.(*HabrPost)) 171 | } 172 | 173 | return items, it.TotalCount(), nil 174 | } 175 | 176 | func (r *Repo) GetPost(id int, withComments bool) (*HabrPost, error) { 177 | if !r.ready { 178 | return nil, fmt.Errorf("repo is not ready") 179 | } 180 | 181 | query := repo.db.Query("posts"). 182 | WhereInt("id", reindexer.EQ, id). 183 | ReqTotal() 184 | 185 | if withComments { 186 | query.Join(repo.db.Query("comments"), "comments").On("id", reindexer.EQ, "post_id") 187 | } 188 | 189 | it := query.Exec() 190 | defer it.Close() 191 | 192 | obj, err := it.FetchOne() 193 | 194 | if err != nil { 195 | return nil, err 196 | } 197 | 198 | return obj.(*HabrPost), nil 199 | } 200 | 201 | func (r *Repo) GetPosts(offset int, limit int, user string, startTime int, endTime int, withComments bool) ([]*HabrPost, int, error) { 202 | if !r.ready { 203 | return nil, 0, fmt.Errorf("repo is not ready") 204 | } 205 | 206 | query := repo.db.Query("posts"). 207 | ReqTotal() 208 | 209 | applyOffsetAndLimit(query, offset, limit) 210 | 211 | if startTime != -1 { 212 | query.WhereInt("time", reindexer.GE, startTime) 213 | } 214 | 215 | if endTime != -1 { 216 | query.WhereInt("time", reindexer.LE, endTime) 217 | } 218 | 219 | if len(user) > 0 { 220 | query.WhereString("user", reindexer.EQ, user) 221 | } 222 | 223 | if withComments { 224 | query.Join(repo.db.Query("comments"), "comments").On("id", reindexer.EQ, "post_id") 225 | } 226 | 227 | query.Sort("time", false) 228 | 229 | it := query.Exec() 230 | defer it.Close() 231 | 232 | if err := it.Error(); err != nil { 233 | return nil, 0, err 234 | } 235 | 236 | items := make([]*HabrPost, 0, it.Count()) 237 | for it.Next() { 238 | item := it.Object() 239 | items = append(items, item.(*HabrPost)) 240 | } 241 | 242 | return items, it.TotalCount(), nil 243 | } 244 | 245 | func (r *Repo) SearchComments(text string, offset, limit int, sortBy string, sortDesc bool) ([]*HabrComment, int, error) { 246 | if !r.ready { 247 | return nil, 0, fmt.Errorf("repo is not ready") 248 | } 249 | 250 | query := repo.db.Query("comments"). 251 | ReqTotal(). 252 | Match("search", textToReindexFullTextDSL(r.cfg.CommentsFt.Fields, text)) 253 | 254 | query.Functions("text = snippet(,,30,30, ...,...
)") 255 | 256 | if len(sortBy) != 0 { 257 | query.Sort(sortBy, sortDesc) 258 | } 259 | 260 | applyOffsetAndLimit(query, offset, limit) 261 | 262 | it := query.Exec() 263 | defer it.Close() 264 | 265 | if err := it.Error(); err != nil { 266 | return nil, 0, err 267 | } 268 | 269 | items := make([]*HabrComment, 0, it.Count()) 270 | for it.Next() { 271 | item := it.Object() 272 | items = append(items, item.(*HabrComment)) 273 | } 274 | 275 | return items, it.TotalCount(), nil 276 | } 277 | 278 | func (r *Repo) updatePostFromFile(filePath string) { 279 | jsonItem, err := ioutil.ReadFile(filePath) 280 | if err != nil { 281 | log.Printf("Error reading file %s: %s\n", filePath, err.Error()) 282 | } 283 | post := HabrPost{} 284 | err = json.Unmarshal(jsonItem, &post) 285 | if err != nil { 286 | log.Printf("Error parsing file %s: %s\n", filePath, err.Error()) 287 | } 288 | 289 | for _, comment := range post.Comments { 290 | comment.PostID = post.ID 291 | err = r.db.Upsert("comments", comment) 292 | if err != nil { 293 | log.Printf("Error upserting comment %d from file %s: %s\n", comment.ID, filePath, err.Error()) 294 | } 295 | } 296 | 297 | post.Comments = post.Comments[:0] 298 | err = r.db.Upsert("posts", post) 299 | if err != nil { 300 | log.Printf("Error upserting post from file %s: %s\n", filePath, err.Error()) 301 | } 302 | 303 | } 304 | 305 | func (r *Repo) RestoreAllFromFiles(path string) { 306 | files, err := ioutil.ReadDir(path) 307 | if err != nil { 308 | log.Fatal(err) 309 | } 310 | 311 | for i, f := range files { 312 | r.updatePostFromFile(path + "/" + f.Name()) 313 | if (i != 0 && (i%1000) == 0) || i == len(files)-1 { 314 | fmt.Printf("processed %d files (from %d)\n", i+1, len(files)) 315 | } 316 | } 317 | } 318 | 319 | func (r *Repo) RestoreRangeFromFiles(path string, startID, finishID int) { 320 | 321 | cnt := 0 322 | for id := startID; id < finishID; id++ { 323 | fileName := fmt.Sprintf("%s/%d.json", path, id) 324 | if _, err := os.Stat(fileName); err == nil { 325 | r.updatePostFromFile(fileName) 326 | cnt++ 327 | } 328 | } 329 | fmt.Printf("processed %d files\n", cnt) 330 | } 331 | 332 | func (r *Repo) setFTConfig(ns string, newCfg FTConfig) error { 333 | 334 | cfg := reindexer.DefaultFtFastConfig() 335 | cfg.MaxTyposInWord = 1 336 | cfg.LogLevel = reindexer.INFO 337 | cfg.Bm25Boost = newCfg.Bm25Boost 338 | cfg.Bm25Weight = newCfg.Bm25Weight 339 | cfg.DistanceBoost = newCfg.DistanceBoost 340 | cfg.DistanceWeight = newCfg.DistanceWeight 341 | cfg.MinRelevancy = newCfg.MinRelevancy 342 | 343 | err := r.db.ConfigureIndex(ns, "search", cfg) 344 | 345 | if err != nil { 346 | return err 347 | } 348 | 349 | switch ns { 350 | case "posts": 351 | r.cfg.PostsFt = newCfg 352 | case "comments": 353 | r.cfg.CommentsFt = newCfg 354 | default: 355 | return fmt.Errorf("Unknown namespace %s", ns) 356 | } 357 | return nil 358 | } 359 | 360 | func (r *Repo) SetFTConfig(ns string, newCfg FTConfig) error { 361 | err := r.setFTConfig(ns, newCfg) 362 | if err != nil { 363 | return err 364 | } 365 | data, err := json.Marshal(r.cfg) 366 | if err != nil { 367 | return err 368 | } 369 | return ioutil.WriteFile("repo.cfg", data, 0666) 370 | } 371 | 372 | func (r *Repo) Init() { 373 | 374 | if r.db == nil { 375 | r.db = reindexer.NewReindex("builtin:///var/lib/reindexer/habr") 376 | r.db.SetLogger(logger) 377 | } 378 | cfgFile, err := ioutil.ReadFile("repo.cfg") 379 | newCfg := RepoConfig{} 380 | 381 | if err == nil { 382 | err = json.Unmarshal(cfgFile, &newCfg) 383 | } 384 | 385 | if err != nil { 386 | 387 | 
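/* repo.cfg was not loaded, so fall back to the built-in full-text ranking defaults; for posts, the Fields string weights title (1.6) and user (1.0) matches above the post body (*^0.4) */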
newCfg.PostsFt = FTConfig{ 388 | Bm25Boost: 0.1, 389 | Bm25Weight: 0.3, 390 | DistanceBoost: 2.0, 391 | DistanceWeight: 0.5, 392 | MinRelevancy: 0.2, 393 | Fields: "*^0.4,user^1.0,title^1.6", 394 | } 395 | newCfg.CommentsFt = FTConfig{ 396 | Bm25Boost: 0.1, 397 | Bm25Weight: 0.3, 398 | DistanceBoost: 2.0, 399 | DistanceWeight: 0.5, 400 | MinRelevancy: 0.2, 401 | Fields: "", 402 | } 403 | } 404 | // cfg.StopWords = []string{"делать", "работать", "например", "получить", "данные", "стоит", "имеет", "компании", "случае", "код", "образом", "возможность", "работает", "свой", "т", "данных", 405 | // "сделать", "0", "позволяет", "помощью", "сразу", "4", "3", "6", "момент", "таким", "работы", "2", "использовать", 406 | // "с", "достаточно", "является", "часть", "10", "поэтому", "количество"} 407 | 408 | if err = r.db.OpenNamespace("comments", reindexer.DefaultNamespaceOptions(), HabrComment{}); err != nil { 409 | panic(err) 410 | } 411 | if err = r.setFTConfig("comments", newCfg.CommentsFt); err != nil { 412 | panic(err) 413 | } 414 | 415 | if err = r.db.OpenNamespace("posts", reindexer.DefaultNamespaceOptions(), HabrPost{}); err != nil { 416 | panic(err) 417 | } 418 | if err = r.setFTConfig("posts", newCfg.PostsFt); err != nil { 419 | panic(err) 420 | } 421 | repo.WarmUp() 422 | } 423 | 424 | func (r *Repo) WarmUp() { 425 | it := r.db.Query("comments").Where("search", reindexer.EQ, "").Exec() 426 | if it.Error() != nil { 427 | log.Print(it.Error().Error()) 428 | } 429 | it.Close() 430 | it = r.db.Query("posts").Where("search", reindexer.EQ, "").Exec() 431 | if it.Error() != nil { 432 | log.Print(it.Error().Error()) 433 | } 434 | r.ready = true 435 | it.Close() 436 | } 437 | 438 | func (r *Repo) Done() { 439 | r.ready = false 440 | r.db.CloseNamespace("posts") 441 | r.db.CloseNamespace("comments") 442 | } 443 | 444 | type Logger struct { 445 | } 446 | 447 | func (l *Logger) Printf(level int, format string, msg ...interface{}) { 448 | if level <= reindexer.TRACE { 449 | log.Printf(format, msg...) 450 | } 451 | } 452 | 453 | var logger = &Logger{} 454 | --------------------------------------------------------------------------------
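A note on trying the service: the sketch below is a minimal API client for the HTTP endpoints defined in http.go (it is not part of the repository). It assumes the server was started with `habr-search run` and is listening on the default `:8881` address; since `HandlerWrapper` only accepts requests from 127.0.0.1, it has to run on the same host. The decoded fields mirror `PostsResponce` and `HabrPostView`.

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"net/url"
)

// searchResponse mirrors the parts of PostsResponce / HabrPostView
// from http.go that this example actually prints.
type searchResponse struct {
	Items []struct {
		ID    int    `json:"id"`
		Title string `json:"title"`
		User  string `json:"user"`
		Link  string `json:"link"`
	} `json:"items"`
	TotalCount int   `json:"total_count"`
	ElapsedMs  int64 `json:"elapsed_ms"`
	Success    bool  `json:"success"`
}

func main() {
	// Full-text search over posts, newest first, first 5 hits.
	q := url.Values{}
	q.Set("query", "reindexer")
	q.Set("search_type", "posts")
	q.Set("limit", "5")
	q.Set("sort_by", "time")
	q.Set("sort_desc", "1")

	resp, err := http.Get("http://127.0.0.1:8881/api/search?" + q.Encode())
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	var sr searchResponse
	if err := json.NewDecoder(resp.Body).Decode(&sr); err != nil {
		log.Fatal(err)
	}

	fmt.Printf("%d of %d posts, %d ms\n", len(sr.Items), sr.TotalCount, sr.ElapsedMs)
	for _, it := range sr.Items {
		fmt.Printf("#%d %q by %s -> %s\n", it.ID, it.Title, it.User, it.Link)
	}
}
```

Before the query reaches Reindexer, `textToReindexFullTextDSL` rewrites it: with the default posts config, an input such as `reindexer fast` becomes roughly `@*^0.4,user^1.0,title^1.6 *reindexer~* +*fast~*`, i.e. terms of three or more characters are expanded for prefix and typo matching, subsequent terms are marked as required, and title/user matches are weighted above the post body.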