|
- package enry
-
- import (
- "math"
- "sort"
-
- "github.com/src-d/enry/v2/internal/tokenizer"
- )
-
- // Classifier is the interface in charge to detect the possible languages of the given content based on a set of
- // candidates. Candidates is a map which can be used to assign weights to languages dynamically.
- type Classifier interface {
- Classify(content []byte, candidates map[string]float64) (languages []string)
- }
-
- type classifier struct {
- languagesLogProbabilities map[string]float64
- tokensLogProbabilities map[string]map[string]float64
- tokensTotal float64
- }
-
- type scoredLanguage struct {
- language string
- score float64
- }
-
- // Classify returns a sorted slice of possible languages sorted by decreasing language's probability
- func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
-
- var languages map[string]float64
- if len(candidates) == 0 {
- languages = c.knownLangs()
- } else {
- languages = make(map[string]float64, len(candidates))
- for candidate, weight := range candidates {
- if lang, ok := GetLanguageByAlias(candidate); ok {
- candidate = lang
- }
-
- languages[candidate] = weight
- }
- }
-
- empty := len(content) == 0
- scoredLangs := make([]*scoredLanguage, 0, len(languages))
-
- var tokens []string
- if !empty {
- tokens = tokenizer.Tokenize(content)
- }
-
- for language := range languages {
- score := c.languagesLogProbabilities[language]
- if !empty {
- score += c.tokensLogProbability(tokens, language)
- }
- scoredLangs = append(scoredLangs, &scoredLanguage{
- language: language,
- score: score,
- })
- }
-
- return sortLanguagesByScore(scoredLangs)
- }
-
- func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {
- sort.Stable(byScore(scoredLangs))
- sortedLanguages := make([]string, 0, len(scoredLangs))
- for _, scoredLang := range scoredLangs {
- sortedLanguages = append(sortedLanguages, scoredLang.language)
- }
-
- return sortedLanguages
- }
-
- func (c *classifier) knownLangs() map[string]float64 {
- langs := make(map[string]float64, len(c.languagesLogProbabilities))
- for lang := range c.languagesLogProbabilities {
- langs[lang]++
- }
-
- return langs
- }
-
- func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
- var sum float64
- for _, token := range tokens {
- sum += c.tokenProbability(token, language)
- }
-
- return sum
- }
-
- func (c *classifier) tokenProbability(token, language string) float64 {
- tokenProb, ok := c.tokensLogProbabilities[language][token]
- if !ok {
- tokenProb = math.Log(1.000000 / c.tokensTotal)
- }
-
- return tokenProb
- }
-
- type byScore []*scoredLanguage
-
- func (b byScore) Len() int { return len(b) }
- func (b byScore) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
- func (b byScore) Less(i, j int) bool { return b[j].score < b[i].score }
|