├── .gitignore ├── README.md ├── db ├── data.db └── db.go ├── generator └── generator.go ├── go.mod ├── go.sum ├── sayhuuzoku.go ├── scraping ├── scraping.go ├── shopdic.txt └── shoplist.txt └── wakati └── wakati.go /.gitignore: -------------------------------------------------------------------------------- 1 | vendor/ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # say-huuzoku 2 | 3 | ## Description 4 | sayhuuzoku is a command-line tool and Go library that generates shop names in the style of a 風俗店 (huuzoku-shop). 5 | 6 | ## Example 7 | ``` 8 | $ go run sayhuuzoku.go generate -c 2 9 | 月淫乱 10 | 11 | $ go run sayhuuzoku.go generate -c 3 12 | セレブサークル月ちゃり 13 | 14 | $ go run sayhuuzoku.go generate -c 4 15 | INO-遊園PLAYSTAGE 16 | ``` 17 | 18 | ## Installation 19 | ``` 20 | go get -u github.com/YuheiNakasaka/sayhuuzoku 21 | ``` 22 | 23 | ## Usage 24 | 25 | ``` 26 | $ sayhuuzoku h 27 | NAME: 28 | sayhuuzoku - A new cli application to generate a shop name like 風俗店(huuzoku-shop). 29 | 30 | USAGE: 31 | sayhuuzoku [global options] command [command options] [arguments...] 32 | 33 | VERSION: 34 | 0.0.1 35 | 36 | COMMANDS: 37 | init, i Init database 38 | scraping, s Fetch shop name from http://fujoho.jp/index.php?p=shop_list 39 | wakati, w Create wakati data from shoplist file 40 | generate, g Generate shop name like huuzoku (default: 4 words) 41 | help, h Shows a list of commands or help for one command 42 | 43 | GLOBAL OPTIONS: 44 | --help, -h show help 45 | --version, -v print the version 46 | ``` 47 | 48 | ## License 49 | The library is available as open source under the terms of the MIT License. 
50 | -------------------------------------------------------------------------------- /db/data.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuheiNakasaka/sayhuuzoku/a2f148bcb9f7b587a1893682980793fc65812834/db/data.db -------------------------------------------------------------------------------- /db/db.go: -------------------------------------------------------------------------------- 1 | package db 2 | 3 | import ( 4 | "database/sql" 5 | "fmt" 6 | "os" 7 | "os/exec" 8 | "path/filepath" 9 | "strings" 10 | 11 | _ "github.com/mattn/go-sqlite3" 12 | ) 13 | 14 | // MyDB : db struct 15 | type MyDB struct { 16 | Connection *sql.DB 17 | } 18 | 19 | // InitDB : initialize database 20 | var InitDB = false 21 | 22 | // New : create db and keep connection 23 | func (mydb *MyDB) New() error { 24 | b, err := exec.Command("go", "env", "GOPATH").CombinedOutput() 25 | if err != nil { 26 | return fmt.Errorf("Failed to find path to database: %v", err) 27 | } 28 | dbFile := "" 29 | for _, p := range filepath.SplitList(strings.TrimSpace(string(b))) { 30 | p = filepath.Join(p, filepath.FromSlash("/src/github.com/YuheiNakasaka/sayhuuzoku/db/data.db")) 31 | if _, err = os.Stat(p); err == nil { 32 | dbFile = p 33 | break 34 | } 35 | } 36 | if dbFile == "" { 37 | return fmt.Errorf("Failed to find path to database: %v", err) 38 | } 39 | 40 | if InitDB == true { 41 | os.Remove(dbFile) 42 | } 43 | 44 | db, err := sql.Open("sqlite3", dbFile) 45 | if err != nil { 46 | return fmt.Errorf("Failed to open database: %v", err) 47 | } 48 | mydb.Connection = db 49 | 50 | q := "CREATE TABLE IF NOT EXISTS wakati_shopname (" 51 | q += " id INTEGER PRIMARY KEY AUTOINCREMENT" 52 | q += ", word VARCHAR(255) NOT NULL" 53 | q += ", position INTEGER NOT NULL" 54 | q += ", created_at TIMESTAMP DEFAULT (DATETIME('now','localtime'))" 55 | q += ")" 56 | 57 | _, err = db.Exec(q) 58 | return err 59 | } 60 | 
-------------------------------------------------------------------------------- /generator/generator.go: -------------------------------------------------------------------------------- 1 | package generator 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "strings" 7 | "time" 8 | 9 | "github.com/YuheiNakasaka/sayhuuzoku/db" 10 | ) 11 | 12 | // Start : generate shop name 13 | func Start(total int) (string, error) { 14 | if total < 1 { 15 | return "", errors.New("Total count is more than 1") 16 | } 17 | 18 | mydb := &db.MyDB{} 19 | err := mydb.New() 20 | if err != nil { 21 | return "", fmt.Errorf("Failed to connect db: %v", err) 22 | } 23 | defer mydb.Connection.Close() 24 | 25 | words := make([]string, 0, 0) 26 | for i := 1; i <= total; i++ { 27 | query := fmt.Sprintf("select * from wakati_shopname where length(word) > 1 and position = %d group by word order by random() limit 1;", i) 28 | rows, qerr := mydb.Connection.Query(query) 29 | if qerr != nil { 30 | return "", fmt.Errorf("Failed to execute query: %v", err) 31 | } 32 | 33 | for rows.Next() { 34 | var id int 35 | var word string 36 | var position int 37 | var createdAt time.Time 38 | serr := rows.Scan(&id, &word, &position, &createdAt) 39 | if serr != nil { 40 | rows.Close() 41 | return "", fmt.Errorf("Failed to fetch row: %v", err) 42 | } 43 | words = append(words, word) 44 | } 45 | rows.Close() 46 | } 47 | return strings.Join(words, ""), err 48 | } 49 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/YuheiNakasaka/sayhuuzoku 2 | 3 | go 1.16 4 | 5 | require ( 6 | github.com/PuerkitoBio/goquery v1.3.0 7 | github.com/andybalholm/cascadia v1.0.0 // indirect 8 | github.com/ikawaha/kagome.ipadic v1.1.0 9 | github.com/mattn/go-sqlite3 v1.6.0 10 | github.com/urfave/cli v1.20.0 11 | golang.org/x/net v0.0.0-20180308154319-d0aafc73d5cd // indirect 12 | ) 13 | 
-------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/PuerkitoBio/goquery v1.3.0 h1:2LzdaeRwZjIMW7iKEei51jiCPB33mou4AI7QCzS4NgE= 2 | github.com/PuerkitoBio/goquery v1.3.0/go.mod h1:T9ezsOHcCrDCgA8aF1Cqr3sSYbO/xgdy8/R/XiIMAhA= 3 | github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= 4 | github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= 5 | github.com/ikawaha/kagome.ipadic v1.1.0 h1:9hzwhcklEL4Cmp+lM9HQfmDg2nhB43Fe1n9UUY6mifY= 6 | github.com/ikawaha/kagome.ipadic v1.1.0/go.mod h1:DPSBbU0czaJhAb/5uKQZHMc9MTVRpDugJfX+HddPHHg= 7 | github.com/mattn/go-sqlite3 v1.6.0 h1:TDwTWbeII+88Qy55nWlof0DclgAtI4LqGujkYMzmQII= 8 | github.com/mattn/go-sqlite3 v1.6.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc= 9 | github.com/urfave/cli v1.20.0 h1:fDqGv3UG/4jbVl/QkFwEdddtEDjh/5Ov6X+0B/3bPaw= 10 | github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA= 11 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 12 | golang.org/x/net v0.0.0-20180308154319-d0aafc73d5cd h1:GNzbLJRy/nHOFS5m5780xbL4nia5w6cyb8exQGYw3Z4= 13 | golang.org/x/net v0.0.0-20180308154319-d0aafc73d5cd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 14 | -------------------------------------------------------------------------------- /sayhuuzoku.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | 8 | "github.com/YuheiNakasaka/sayhuuzoku/db" 9 | "github.com/YuheiNakasaka/sayhuuzoku/generator" 10 | "github.com/YuheiNakasaka/sayhuuzoku/scraping" 11 | "github.com/YuheiNakasaka/sayhuuzoku/wakati" 12 | "github.com/urfave/cli" 13 | ) 14 | 15 | func main() { 16 | app := cli.NewApp() 17 | app.Name = "sayhuuzoku" 18 | app.Version 
= "0.0.1" 19 | app.Usage = " A new cli application to generate a shop name like 風俗店(huuzoku-shop)." 20 | 21 | app.Commands = []cli.Command{ 22 | { 23 | Name: "init", 24 | Aliases: []string{"i"}, 25 | Usage: "Init database", 26 | Action: func(c *cli.Context) error { 27 | mydb := db.MyDB{} 28 | return mydb.New() 29 | }, 30 | }, 31 | { 32 | Name: "scraping", 33 | Aliases: []string{"s"}, 34 | Usage: "Fetch shop name from http://fujoho.jp/index.php?p=shop_list", 35 | Flags: []cli.Flag{ 36 | cli.IntFlag{ 37 | Name: "max-page, mp", 38 | Usage: "max page of scraping site", 39 | }, 40 | }, 41 | Action: func(c *cli.Context) error { 42 | return scraping.Start(c.Int("max-page")) 43 | }, 44 | }, 45 | { 46 | Name: "wakati", 47 | Aliases: []string{"w"}, 48 | Usage: "Create wakati data from shoplist file", 49 | Action: func(c *cli.Context) error { 50 | return wakati.Start() 51 | }, 52 | }, 53 | { 54 | Name: "generate", 55 | Aliases: []string{"g"}, 56 | Usage: "Generate shop name like huuzoku (default: 4 words)", 57 | Flags: []cli.Flag{ 58 | cli.IntFlag{ 59 | Name: "count, c", 60 | Value: 4, 61 | Usage: "word count", 62 | }, 63 | }, 64 | Action: func(c *cli.Context) error { 65 | shopName, err := generator.Start(c.Int("count")) 66 | if err != nil { 67 | return err 68 | } 69 | fmt.Println(shopName) 70 | return nil 71 | }, 72 | }, 73 | } 74 | 75 | err := app.Run(os.Args) 76 | if err != nil { 77 | log.Fatal(err) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /scraping/scraping.go: -------------------------------------------------------------------------------- 1 | package scraping 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "path/filepath" 7 | "regexp" 8 | "strconv" 9 | "strings" 10 | "sync" 11 | "time" 12 | 13 | "github.com/PuerkitoBio/goquery" 14 | ) 15 | 16 | // ShopListURL : target url 17 | var ShopListURL = "http://fujoho.jp/index.php?p=shop_list&b=" 18 | 19 | // ShopNameFile : shop name file name 20 | var ShopNameFile = 
"/scraping/shoplist.txt" 21 | 22 | // ShopDicFile : shop dictionary file name 23 | var ShopDicFile = "/scraping/shopdic.txt" 24 | 25 | // Start : fetch page and get names 26 | func Start(maxPage int) error { 27 | fmt.Println("Start scraping") 28 | 29 | absDir, err := filepath.Abs(filepath.Dir(".")) 30 | if err != nil { 31 | return fmt.Errorf("Failed to open file: %v", err) 32 | } 33 | 34 | // 店名をリストにするファイル 35 | file, err := os.Create(filepath.Join(absDir, filepath.FromSlash(ShopNameFile))) 36 | if err != nil { 37 | return fmt.Errorf("Failed to create file: %v", err) 38 | } 39 | defer file.Close() 40 | 41 | // 並列取得する 42 | maxConnection := make(chan bool, 5) 43 | wg := &sync.WaitGroup{} 44 | mu := &sync.Mutex{} 45 | 46 | for i := 0; i < maxPage; i++ { 47 | wg.Add(1) 48 | maxConnection <- true 49 | 50 | go func(page int, mu *sync.Mutex) { 51 | defer wg.Done() 52 | time.Sleep(2 * time.Second) // 2秒待つ 53 | 54 | // goqueryでHTML取得 55 | url := ShopListURL + strconv.Itoa(page) 56 | doc, scrapingErr := goquery.NewDocument(url) 57 | if scrapingErr != nil { 58 | err = fmt.Errorf("Failed to scrape: %v", scrapingErr) 59 | return 60 | } 61 | fmt.Println("Scraping: " + url) 62 | 63 | // 店舗の載ってる範囲をチェック 64 | if checkValidSite(doc) { 65 | // 店名を抜き出してファイルに書き出す 66 | fetchShopName(doc, file, mu) 67 | } 68 | 69 | <-maxConnection 70 | }(i, mu) 71 | } 72 | wg.Wait() 73 | fmt.Println("Finish") 74 | return err 75 | } 76 | 77 | func checkValidSite(doc *goquery.Document) bool { 78 | var t string 79 | doc.Find(".data-nothing").Each(func(_ int, s *goquery.Selection) { 80 | t = s.Text() 81 | fmt.Println(t) 82 | }) 83 | if t == "" { 84 | return true 85 | } 86 | return false 87 | } 88 | 89 | func fetchShopName(doc *goquery.Document, file *os.File, mu *sync.Mutex) { 90 | mu.Lock() 91 | defer mu.Unlock() 92 | doc.Find(".shop-name").Each(func(i int, s *goquery.Selection) { 93 | shopName := s.Text() 94 | // 新店舗のmarkは除く 95 | shopName = strings.Replace(shopName, "New!", "", 1) 96 | // カッコの補足は除く 97 | rep1 
:= regexp.MustCompile(`[\(|(].+[\)|)]`) 98 | shopName = rep1.ReplaceAllString(shopName, "") 99 | 100 | // ファイルに書き込む 101 | file.Write(([]byte)(shopName + "\n")) 102 | // fmt.Printf("Result %d: %s\n", i, shopName) 103 | }) 104 | } 105 | -------------------------------------------------------------------------------- /scraping/shopdic.txt: -------------------------------------------------------------------------------- 1 | にゃんだ,にゃんだ,ニャンダ,カスタム名詞 2 | あね,あね,アネ,カスタム名詞 3 | こき,こき,コキ,カスタム動詞 4 | ぽっちゃり,ぽっちゃり,ポッチャリ,カスタム名詞 5 | きゃらめる,きゃらめる,キャラメル,カスタム名詞 6 | ぽっぷこーん,ぽっぷこーん,ポップコーン,カスタム名詞 7 | くりぃむ,くりぃむ,クリィム,カスタム名詞 8 | れもん,れもん,レモン,カスタム名詞 9 | たすてぃっく,たすてぃっく,タスティック,カスタム名詞 10 | ふぃんがー,ふぃんがー,フィンガー,カスタム名詞 11 | ごっど,ごっど,ゴッド,カスタム名詞 12 | らんでぇぶぅ,らんでぇぶぅ,ランデェブゥ,カスタム名詞 13 | ふぇら,ふぇら,フェラ,カスタム名詞 14 | みるきぃ,みるきぃ,ミルキィ,カスタム名詞 15 | すぅぱぁ,すぅぱぁ,スゥパァ,カスタム名詞 16 | まぁめいど,まぁめいど,マァメイド,カスタム名詞 17 | らいくぁ,らいくぁ,ライクァ,カスタム名詞 18 | ばーじん,ばーじん,バージン,カスタム名詞 19 | みるく,みるく,ミルク,カスタム名詞 20 | しぇいく,しぇいく,シェイク,カスタム名詞 21 | えんじぇる,えんじぇる,エンジェル,カスタム名詞 22 | まみぃ,まみぃ,マミィ,カスタム人名 23 | くらぶ,くらぶ,クラブ,カスタム名詞 24 | -------------------------------------------------------------------------------- /wakati/wakati.go: -------------------------------------------------------------------------------- 1 | package wakati 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "os" 7 | "path/filepath" 8 | "runtime" 9 | "strings" 10 | "sync" 11 | "time" 12 | 13 | "github.com/YuheiNakasaka/sayhuuzoku/db" 14 | "github.com/YuheiNakasaka/sayhuuzoku/scraping" 15 | "github.com/ikawaha/kagome.ipadic/tokenizer" 16 | ) 17 | 18 | // MyToken : token struct 19 | type MyToken struct { 20 | text string 21 | pos int 22 | } 23 | 24 | // Start : create wakati file 25 | func Start() error { 26 | start := time.Now() 27 | runtime.GOMAXPROCS(runtime.NumCPU()) 28 | 29 | // wakti ファイルを開く 30 | db.InitDB = true // dbファイルを初期化する 31 | mydb := &db.MyDB{} 32 | err := mydb.New() 33 | if err != nil { 34 | return fmt.Errorf("Failed to open db: %v", err) 35 | } 36 | defer mydb.Connection.Close() 37 | 38 | // 
ファイルを1行ずつ読み込む準備 39 | absDir, err := filepath.Abs(filepath.Dir(".")) 40 | if err != nil { 41 | return fmt.Errorf("Failed to open file: %v", err) 42 | } 43 | file, err := os.Open(filepath.Join(absDir, filepath.FromSlash(scraping.ShopNameFile))) 44 | if err != nil { 45 | return fmt.Errorf("Failed to open shop name file: %v", err) 46 | } 47 | defer file.Close() 48 | scanner := bufio.NewScanner(file) 49 | 50 | // 読みこんだテキストを貯めるチャンネルと 51 | // token処理した単語を送るチャンネル 52 | wg := &sync.WaitGroup{} 53 | lines := make(chan string) 54 | values := make(chan MyToken) 55 | 56 | // テキストを処理 57 | t := tokenizer.New() 58 | 59 | dic, err := os.Open(absDir + scraping.ShopDicFile) 60 | if err != nil { 61 | return fmt.Errorf("Failed to open user dictionary file: %v", err) 62 | } 63 | defer dic.Close() 64 | userDicRec, err := tokenizer.NewUserDicRecords(dic) 65 | if err != nil { 66 | return fmt.Errorf("Failed to create user dictionary record: %v", err) 67 | } 68 | userDic, err := userDicRec.NewUserDic() 69 | if err != nil { 70 | return fmt.Errorf("Failed to create user dictionary: %v", err) 71 | } 72 | t.SetUserDic(userDic) 73 | for j := 0; j < 100; j++ { 74 | wg.Add(1) 75 | go func() { 76 | defer wg.Done() 77 | for l := range lines { 78 | cnt := 1 // 除いたtoken分詰めたposition 79 | tokens := t.Tokenize(l) 80 | for _, token := range tokens { 81 | if token.Class == tokenizer.DUMMY { 82 | continue 83 | } 84 | s, nerr := normalize(token) 85 | if nerr != nil { 86 | continue 87 | } 88 | if len(s) > 0 { 89 | mytoken := MyToken{} 90 | mytoken.text = s 91 | mytoken.pos = cnt 92 | values <- mytoken 93 | cnt++ 94 | } 95 | } 96 | } 97 | }() 98 | } 99 | 100 | // fileをガッと読む 101 | go func() { 102 | for scanner.Scan() { 103 | text := scanner.Text() 104 | // 最後の行 105 | if text == "\n" || text == "" || text == " " { 106 | return 107 | } 108 | lines <- text 109 | } 110 | close(lines) 111 | }() 112 | 113 | go func() { 114 | wg.Wait() 115 | close(values) 116 | }() 117 | 118 | // 多くしすぎるとtoo many sql 
variablesのエラーが出るのでぎりぎりまで 119 | mu := &sync.Mutex{} 120 | valueQueue := make([]MyToken, 0, 0) 121 | for v := range values { 122 | valueQueue = append(valueQueue, v) 123 | if len(valueQueue) == 200 { 124 | if err := writeMutex(valueQueue, mydb, mu); err != nil { 125 | return err 126 | } 127 | valueQueue = make([]MyToken, 0, 0) 128 | } 129 | } 130 | writeMutex(valueQueue, mydb, mu) 131 | 132 | end := time.Now() 133 | fmt.Printf("Finish: %f秒\n", (end.Sub(start)).Seconds()) 134 | return nil 135 | } 136 | 137 | // bulk insertする 138 | func writeMutex(values []MyToken, mydb *db.MyDB, mu *sync.Mutex) error { 139 | mu.Lock() 140 | defer mu.Unlock() 141 | 142 | // valuesをbulk insert用のクエリにする 143 | valStr := make([]string, 0, 0) 144 | valArgs := make([]interface{}, 0, 0) 145 | for _, val := range values { 146 | if val.text == " " { 147 | continue 148 | } 149 | valStr = append(valStr, "(?, ?)") 150 | valArgs = append(valArgs, val.text) 151 | valArgs = append(valArgs, val.pos) 152 | } 153 | query := fmt.Sprintf("INSERT INTO wakati_shopname(word, position) values %s", strings.Join(valStr, ",")) 154 | 155 | stmt, err := mydb.Connection.Prepare(query) 156 | if err != nil { 157 | return fmt.Errorf("Error occured in stmt: %v", err) 158 | } 159 | 160 | _, err = stmt.Exec(valArgs...) 161 | if err != nil { 162 | return fmt.Errorf("Error occured in exec: %v", err) 163 | } 164 | return nil 165 | } 166 | 167 | // 店名の正規化をする 168 | func normalize(token tokenizer.Token) (string, error) { 169 | var err error 170 | s := strings.TrimSpace(token.Surface) 171 | features := token.Features() 172 | 173 | for _, f := range features { 174 | if f == "空白" || f == "助詞" || f == "助動詞" || f == "サ変接続" || f == "括弧開" || f == "括弧閉" || f == "句点" || f == "地域" { 175 | err = fmt.Errorf("Invalid style word: %s", f) 176 | return "", err 177 | } 178 | if s == "-" || s == "~" || s == "~" || s == "ー" || s == "店" || 179 | s == "." || s == "!" || s == "・" || s == "っ" || s == "s" || s == "ぽ" || 180 | s == "…" || s == "?" 
|| s == "、" || s == "倶楽部" || s == "club" || s == "CLUB" || 181 | s == "クラブ" || s == "Club" || s == "&" || s == "☆" || s == "お" { 182 | err = fmt.Errorf("Stop word: %s", f) 183 | return "", err 184 | } 185 | if f == "動詞" && len(s) == 1 { 186 | err = fmt.Errorf("Unusal word: %s %s", f, s) 187 | return "", err 188 | } 189 | if f != "名詞" && len(s) == 1 { 190 | err = fmt.Errorf("Unusal word: %s %s", f, s) 191 | return "", err 192 | } 193 | } 194 | //fmt.Println(s, token.Features()) 195 | return s, err 196 | } 197 | --------------------------------------------------------------------------------