You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

elastic_search.go 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390
  1. // Copyright 2020 The Gitea Authors. All rights reserved.
  2. // Use of this source code is governed by a MIT-style
  3. // license that can be found in the LICENSE file.
  4. package code
  5. import (
  6. "context"
  7. "encoding/json"
  8. "fmt"
  9. "strconv"
  10. "strings"
  11. "time"
  12. "code.gitea.io/gitea/models"
  13. "code.gitea.io/gitea/modules/analyze"
  14. "code.gitea.io/gitea/modules/base"
  15. "code.gitea.io/gitea/modules/charset"
  16. "code.gitea.io/gitea/modules/git"
  17. "code.gitea.io/gitea/modules/log"
  18. "code.gitea.io/gitea/modules/setting"
  19. "code.gitea.io/gitea/modules/timeutil"
  20. "github.com/go-enry/go-enry/v2"
  21. "github.com/olivere/elastic/v7"
  22. )
const (
	// esRepoIndexerLatestVersion is the version number embedded in the name of
	// the concrete index (see realIndexerName).
	esRepoIndexerLatestVersion = 1
)

var (
	// Compile-time check that *ElasticSearchIndexer satisfies Indexer.
	_ Indexer = &ElasticSearchIndexer{}
)
// ElasticSearchIndexer implements the Indexer interface on top of an
// Elasticsearch cluster.
type ElasticSearchIndexer struct {
	// client is the Elasticsearch client every request of this indexer uses.
	client *elastic.Client
	// indexerAliasName is the alias that index, delete and search requests
	// are addressed to; the concrete index name is derived from it by
	// realIndexerName.
	indexerAliasName string
}
// elasticLogger wraps a *log.Logger so that it can be passed to the elastic
// client's SetTraceLog / SetInfoLog / SetErrorLog options, which are fed
// through its Printf method.
type elasticLogger struct {
	*log.Logger
}
  37. func (l elasticLogger) Printf(format string, args ...interface{}) {
  38. _ = l.Logger.Log(2, l.Logger.GetLevel(), format, args...)
  39. }
  40. // NewElasticSearchIndexer creates a new elasticsearch indexer
  41. func NewElasticSearchIndexer(url, indexerName string) (*ElasticSearchIndexer, bool, error) {
  42. opts := []elastic.ClientOptionFunc{
  43. elastic.SetURL(url),
  44. elastic.SetSniff(false),
  45. elastic.SetHealthcheckInterval(10 * time.Second),
  46. elastic.SetGzip(false),
  47. }
  48. logger := elasticLogger{log.GetLogger(log.DEFAULT)}
  49. if logger.GetLevel() == log.TRACE || logger.GetLevel() == log.DEBUG {
  50. opts = append(opts, elastic.SetTraceLog(logger))
  51. } else if logger.GetLevel() == log.ERROR || logger.GetLevel() == log.CRITICAL || logger.GetLevel() == log.FATAL {
  52. opts = append(opts, elastic.SetErrorLog(logger))
  53. } else if logger.GetLevel() == log.INFO || logger.GetLevel() == log.WARN {
  54. opts = append(opts, elastic.SetInfoLog(logger))
  55. }
  56. client, err := elastic.NewClient(opts...)
  57. if err != nil {
  58. return nil, false, err
  59. }
  60. indexer := &ElasticSearchIndexer{
  61. client: client,
  62. indexerAliasName: indexerName,
  63. }
  64. exists, err := indexer.init()
  65. return indexer, !exists, err
  66. }
const (
	// defaultMapping is the request body sent when the versioned index is
	// created. It declares the document fields: repo_id and updated_at as
	// long, content as text, commit_id and language as keyword.
	defaultMapping = `{
"mappings": {
"properties": {
"repo_id": {
"type": "long",
"index": true
},
"content": {
"type": "text",
"index": true
},
"commit_id": {
"type": "keyword",
"index": true
},
"language": {
"type": "keyword",
"index": true
},
"updated_at": {
"type": "long",
"index": true
}
}
}
}`
)
  95. func (b *ElasticSearchIndexer) realIndexerName() string {
  96. return fmt.Sprintf("%s.v%d", b.indexerAliasName, esRepoIndexerLatestVersion)
  97. }
  98. // Init will initialize the indexer
  99. func (b *ElasticSearchIndexer) init() (bool, error) {
  100. ctx := context.Background()
  101. exists, err := b.client.IndexExists(b.realIndexerName()).Do(ctx)
  102. if err != nil {
  103. return false, err
  104. }
  105. if !exists {
  106. var mapping = defaultMapping
  107. createIndex, err := b.client.CreateIndex(b.realIndexerName()).BodyString(mapping).Do(ctx)
  108. if err != nil {
  109. return false, err
  110. }
  111. if !createIndex.Acknowledged {
  112. return false, fmt.Errorf("create index %s with %s failed", b.realIndexerName(), mapping)
  113. }
  114. }
  115. // check version
  116. r, err := b.client.Aliases().Do(ctx)
  117. if err != nil {
  118. return false, err
  119. }
  120. realIndexerNames := r.IndicesByAlias(b.indexerAliasName)
  121. if len(realIndexerNames) < 1 {
  122. res, err := b.client.Alias().
  123. Add(b.realIndexerName(), b.indexerAliasName).
  124. Do(ctx)
  125. if err != nil {
  126. return false, err
  127. }
  128. if !res.Acknowledged {
  129. return false, fmt.Errorf("")
  130. }
  131. } else if len(realIndexerNames) >= 1 && realIndexerNames[0] < b.realIndexerName() {
  132. log.Warn("Found older gitea indexer named %s, but we will create a new one %s and keep the old NOT DELETED. You can delete the old version after the upgrade succeed.",
  133. realIndexerNames[0], b.realIndexerName())
  134. res, err := b.client.Alias().
  135. Remove(realIndexerNames[0], b.indexerAliasName).
  136. Add(b.realIndexerName(), b.indexerAliasName).
  137. Do(ctx)
  138. if err != nil {
  139. return false, err
  140. }
  141. if !res.Acknowledged {
  142. return false, fmt.Errorf("")
  143. }
  144. }
  145. return exists, nil
  146. }
  147. func (b *ElasticSearchIndexer) addUpdate(sha string, update fileUpdate, repo *models.Repository) ([]elastic.BulkableRequest, error) {
  148. // Ignore vendored files in code search
  149. if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
  150. return nil, nil
  151. }
  152. stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
  153. RunInDir(repo.RepoPath())
  154. if err != nil {
  155. return nil, err
  156. }
  157. if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
  158. return nil, fmt.Errorf("Misformatted git cat-file output: %v", err)
  159. } else if int64(size) > setting.Indexer.MaxIndexerFileSize {
  160. return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil
  161. }
  162. fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
  163. RunInDirBytes(repo.RepoPath())
  164. if err != nil {
  165. return nil, err
  166. } else if !base.IsTextFile(fileContents) {
  167. // FIXME: UTF-16 files will probably fail here
  168. return nil, nil
  169. }
  170. id := filenameIndexerID(repo.ID, update.Filename)
  171. return []elastic.BulkableRequest{
  172. elastic.NewBulkIndexRequest().
  173. Index(b.indexerAliasName).
  174. Id(id).
  175. Doc(map[string]interface{}{
  176. "repo_id": repo.ID,
  177. "content": string(charset.ToUTF8DropErrors(fileContents)),
  178. "commit_id": sha,
  179. "language": analyze.GetCodeLanguage(update.Filename, fileContents),
  180. "updated_at": timeutil.TimeStampNow(),
  181. }),
  182. }, nil
  183. }
  184. func (b *ElasticSearchIndexer) addDelete(filename string, repo *models.Repository) elastic.BulkableRequest {
  185. id := filenameIndexerID(repo.ID, filename)
  186. return elastic.NewBulkDeleteRequest().
  187. Index(b.indexerAliasName).
  188. Id(id)
  189. }
  190. // Index will save the index data
  191. func (b *ElasticSearchIndexer) Index(repo *models.Repository, sha string, changes *repoChanges) error {
  192. reqs := make([]elastic.BulkableRequest, 0)
  193. for _, update := range changes.Updates {
  194. updateReqs, err := b.addUpdate(sha, update, repo)
  195. if err != nil {
  196. return err
  197. }
  198. if len(updateReqs) > 0 {
  199. reqs = append(reqs, updateReqs...)
  200. }
  201. }
  202. for _, filename := range changes.RemovedFilenames {
  203. reqs = append(reqs, b.addDelete(filename, repo))
  204. }
  205. if len(reqs) > 0 {
  206. _, err := b.client.Bulk().
  207. Index(b.indexerAliasName).
  208. Add(reqs...).
  209. Do(context.Background())
  210. return err
  211. }
  212. return nil
  213. }
  214. // Delete deletes indexes by ids
  215. func (b *ElasticSearchIndexer) Delete(repoID int64) error {
  216. _, err := b.client.DeleteByQuery(b.indexerAliasName).
  217. Query(elastic.NewTermsQuery("repo_id", repoID)).
  218. Do(context.Background())
  219. return err
  220. }
  221. func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
  222. hits := make([]*SearchResult, 0, pageSize)
  223. for _, hit := range searchResult.Hits.Hits {
  224. // FIXME: There is no way to get the position the keyword on the content currently on the same request.
  225. // So we get it from content, this may made the query slower. See
  226. // https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
  227. var startIndex, endIndex int = -1, -1
  228. c, ok := hit.Highlight["content"]
  229. if ok && len(c) > 0 {
  230. var subStr = make([]rune, 0, len(kw))
  231. startIndex = strings.IndexFunc(c[0], func(r rune) bool {
  232. if len(subStr) >= len(kw) {
  233. subStr = subStr[1:]
  234. }
  235. subStr = append(subStr, r)
  236. return strings.EqualFold(kw, string(subStr))
  237. })
  238. if startIndex > -1 {
  239. endIndex = startIndex + len(kw)
  240. } else {
  241. panic(fmt.Sprintf("1===%#v", hit.Highlight))
  242. }
  243. } else {
  244. panic(fmt.Sprintf("2===%#v", hit.Highlight))
  245. }
  246. repoID, fileName := parseIndexerID(hit.Id)
  247. var res = make(map[string]interface{})
  248. if err := json.Unmarshal(hit.Source, &res); err != nil {
  249. return 0, nil, nil, err
  250. }
  251. language := res["language"].(string)
  252. hits = append(hits, &SearchResult{
  253. RepoID: repoID,
  254. Filename: fileName,
  255. CommitID: res["commit_id"].(string),
  256. Content: res["content"].(string),
  257. UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
  258. Language: language,
  259. StartIndex: startIndex,
  260. EndIndex: endIndex,
  261. Color: enry.GetColor(language),
  262. })
  263. }
  264. return searchResult.TotalHits(), hits, extractAggs(searchResult), nil
  265. }
  266. func extractAggs(searchResult *elastic.SearchResult) []*SearchResultLanguages {
  267. var searchResultLanguages []*SearchResultLanguages
  268. agg, found := searchResult.Aggregations.Terms("language")
  269. if found {
  270. searchResultLanguages = make([]*SearchResultLanguages, 0, 10)
  271. for _, bucket := range agg.Buckets {
  272. searchResultLanguages = append(searchResultLanguages, &SearchResultLanguages{
  273. Language: bucket.Key.(string),
  274. Color: enry.GetColor(bucket.Key.(string)),
  275. Count: int(bucket.DocCount),
  276. })
  277. }
  278. }
  279. return searchResultLanguages
  280. }
  281. // Search searches for codes and language stats by given conditions.
  282. func (b *ElasticSearchIndexer) Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
  283. kwQuery := elastic.NewMultiMatchQuery(keyword, "content")
  284. query := elastic.NewBoolQuery()
  285. query = query.Must(kwQuery)
  286. if len(repoIDs) > 0 {
  287. var repoStrs = make([]interface{}, 0, len(repoIDs))
  288. for _, repoID := range repoIDs {
  289. repoStrs = append(repoStrs, repoID)
  290. }
  291. repoQuery := elastic.NewTermsQuery("repo_id", repoStrs...)
  292. query = query.Must(repoQuery)
  293. }
  294. var (
  295. start int
  296. kw = "<em>" + keyword + "</em>"
  297. aggregation = elastic.NewTermsAggregation().Field("language").Size(10).OrderByCountDesc()
  298. )
  299. if page > 0 {
  300. start = (page - 1) * pageSize
  301. }
  302. if len(language) == 0 {
  303. searchResult, err := b.client.Search().
  304. Index(b.indexerAliasName).
  305. Aggregation("language", aggregation).
  306. Query(query).
  307. Highlight(elastic.NewHighlight().Field("content")).
  308. Sort("repo_id", true).
  309. From(start).Size(pageSize).
  310. Do(context.Background())
  311. if err != nil {
  312. return 0, nil, nil, err
  313. }
  314. return convertResult(searchResult, kw, pageSize)
  315. }
  316. langQuery := elastic.NewMatchQuery("language", language)
  317. countResult, err := b.client.Search().
  318. Index(b.indexerAliasName).
  319. Aggregation("language", aggregation).
  320. Query(query).
  321. Size(0). // We only needs stats information
  322. Do(context.Background())
  323. if err != nil {
  324. return 0, nil, nil, err
  325. }
  326. query = query.Must(langQuery)
  327. searchResult, err := b.client.Search().
  328. Index(b.indexerAliasName).
  329. Query(query).
  330. Highlight(elastic.NewHighlight().Field("content")).
  331. Sort("repo_id", true).
  332. From(start).Size(pageSize).
  333. Do(context.Background())
  334. if err != nil {
  335. return 0, nil, nil, err
  336. }
  337. total, hits, _, err := convertResult(searchResult, kw, pageSize)
  338. return total, hits, extractAggs(countResult), err
  339. }
// Close implements Indexer. The method body is empty: nothing is released or
// shut down here.
func (b *ElasticSearchIndexer) Close() {}