Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Working version, needs improvement [WIP] #39

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
2 changes: 1 addition & 1 deletion cmd/license-detector/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ func process(arg string) ([]match, error) {
return nil, err
}

ls, err := licensedb.Detect(resolvedFiler)
ls, _, err := licensedb.Detect(resolvedFiler)
if err != nil {
return nil, err
}
Expand Down
2 changes: 1 addition & 1 deletion licensedb/dataset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ func TestDataset(t *testing.T) {
for _, project := range projects {
go func(project filer.File) {
defer wg.Done()
myLicenses, _ := Detect(filer.NestFiler(rootFiler, project.Name))
myLicenses, _, _ := Detect(filer.NestFiler(rootFiler, project.Name))
if len(myLicenses) > 0 {
mutex.Lock()
licenses[project.Name] = myLicenses
Expand Down
25 changes: 25 additions & 0 deletions licensedb/internal/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -460,3 +460,28 @@ func tfidf(freq int, docfreq int, ndocs int) float32 {
}
return weight
}

// QuerySourceFile returns the candidate licenses found in the text extracted
// from a source file, mapped to their scores.
//
// The text is first matched with QueryLicenseText; the resulting candidates
// are then handed to addURLMatches together with the same text. When a
// license is reported more than once, the highest score is kept.
func (db *database) QuerySourceFile(text string) map[string]float32 {
	candidates := map[string]float32{}
	// merge keeps, for every license, the best score seen so far. It is
	// deliberately not called "append" so that the builtin is not shadowed.
	merge := func(others map[string]float32) {
		for key, val := range others {
			if candidates[key] < val {
				candidates[key] = val
			}
		}
	}
	merge(db.QueryLicenseText(text))
	// TODO: fall back to the license name substrings once
	// investigateSourceFile is implemented:
	// if len(candidates) == 0 {
	// 	merge(investigateSourceFile(text, db.nameSubstrings, db.nameSubstringSizes))
	// 	if len(candidates) == 0 {
	// 		merge(investigateSourceFile(text, db.nameShortSubstrings, db.nameShortSubstringSizes))
	// 	}
	// }
	if db.debug {
		for key, val := range candidates {
			println("NLP", key, val)
		}
	}
	db.addURLMatches(candidates, text)
	return candidates
}
114 changes: 114 additions & 0 deletions licensedb/internal/investigation.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

"gopkg.in/src-d/go-license-detector.v2/licensedb/filer"
"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/processors"
"gopkg.in/src-d/enry.v1"
)

var (
Expand Down Expand Up @@ -62,6 +63,36 @@ var (

licenseDirectoryRe = regexp.MustCompile(fmt.Sprintf(
"^(%s)$", strings.Join(licenseFileNames, "|")))

commentSyntaxesRe = map[string]*regexp.Regexp {
"ANTLR": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"C++": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"C#": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"CSS": regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`),
"Go": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"HTML": regexp.MustCompile(`<\!--(.*?\t?\r?\n?)+?-->`),
"Haskel": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\{-(.*?\t?\r?\n?)+?\-\})`),
"Java": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"JavaScript": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"Matlab": regexp.MustCompile(`(%.*\t?\r?\n?)|(%\{(.?\t?\r?\n?)+?%\})`),
"Objective-C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"Perl": regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=cut)`),
"PHP": regexp.MustCompile(`(#.*\t?\r?\n?)|(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"Python": regexp.MustCompile("('''(.?\t?\r?\n?)+?''')|(#.*\t?\r?\n?)|(\"\"\"(.?\t?\r?\n?)+?\"\"\")"),
"Ruby": regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=end)`),
"Rust": regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`),
"R": regexp.MustCompile(`#.*\t?\r?\n?`),
"Shell": regexp.MustCompile(`#.*\t?\r?\n?`),
"Swift": regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"SAS": regexp.MustCompile(`(\*(.*?\t?\r?\n?)+?;)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"Scala": regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"SQL": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"TypeScript": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"YAML": regexp.MustCompile(`#.*\t?\r?\n?`),
}

// cleanCommentsRe matches the comment delimiters which are stripped from the
// extracted comments: `/*`, `*/`, `//`, `#`, the leading `*` of block comment
// lines, `=begin`, `=cut` and `=end`. The delimiters are only matched at the
// beginning of a line (after optional indentation), plus a closing `*/` at
// the end of a line, so that `/`, `#` and `*` inside the comment text are
// preserved, e.g. in URLs such as http://www.apache.org/licenses/LICENSE-2.0.
cleanCommentsRe = regexp.MustCompile(`(?m)^[ \t]*(?:/\*+|\*+/|/{2,}|#+|\*+|=begin|=cut|=end)|\*+/[ \t\r]*$`)
)

// ExtractLicenseFiles returns the list of possible license texts.
Expand Down Expand Up @@ -157,3 +188,86 @@ func InvestigateReadmeText(text []byte, fs filer.Filer) map[string]float32 {
func IsLicenseDirectory(fileName string) bool {
	// The name is lowercased first, so the comparison with the known license
	// directory names (licenseDirectoryRe) is case-insensitive.
	lowered := strings.ToLower(fileName)
	return licenseDirectoryRe.MatchString(lowered)
}

// ExtractSourceFiles reads the given source code files and returns their header comments,
// when available, together with the names of the files those comments were found in.
// Enry is used to detect the language of each file. Files which cannot be read are skipped.
func ExtractSourceFiles(files []string, fs filer.Filer) ([][]byte, []string) {
	contents := [][]byte{}
	readable := []string{}
	languages := []string{}
	for _, name := range files {
		data, err := fs.ReadFile(name)
		if err != nil {
			// Best effort: an unreadable file is simply not investigated.
			continue
		}
		languages = append(languages, enry.GetLanguage(name, data))
		contents = append(contents, data)
		readable = append(readable, name)
	}
	if len(contents) == 0 {
		// Nothing could be read: return empty, non-nil results.
		return contents, []string{}
	}
	return ExtractHeaderComments(contents, languages, readable)
}

// ExtractHeaderComments searches the beginning of source code files for comments and returns
// their text with the comment delimiters removed, together with the names of the files
// in which comments were found.
//
// candidates, langs and fileNames are parallel slices: langs[i] is the language of
// candidates[i] and fileNames[i] is its name, so langs and fileNames must be at least as
// long as candidates. Only the first 1024 bytes of each file are inspected.
// Languages without an entry in commentSyntaxesRe are skipped; each of them is reported
// once on the standard output.
func ExtractHeaderComments(candidates [][]byte, langs []string, fileNames []string) ([][]byte, []string) {
	// headerSize is the number of leading bytes of a file searched for comments.
	const headerSize = 1024
	comments := [][]byte{}
	commentsFileName := []string{}
	// unsupported keeps the first-seen order of the skipped languages,
	// reported guards against listing the same language twice.
	var unsupported []string
	reported := map[string]bool{}
	for i, candidate := range candidates {
		lang := langs[i]
		re, supported := commentSyntaxesRe[lang]
		if !supported {
			// Language names are compared literally: names such as "C++" are not valid
			// regular expressions and must not be used as patterns.
			// An empty name (language not detected) is not worth reporting.
			if lang != "" && !reported[lang] {
				reported[lang] = true
				unsupported = append(unsupported, lang)
			}
			continue
		}
		header := candidate
		if len(header) > headerSize {
			header = header[:headerSize]
		}
		matches := re.FindAllString(string(header), -1)
		if matches == nil {
			continue
		}
		var text strings.Builder
		for _, m := range matches {
			text.WriteString(cleanCommentsRe.ReplaceAllString(m, ""))
		}
		commentsFileName = append(commentsFileName, fileNames[i])
		comments = append(comments, []byte(text.String()))
	}
	if len(unsupported) > 0 {
		fmt.Println("The following file types were not investigated for licenses on the comments:", strings.Join(unsupported, ", ")+". ")
	}
	return comments, commentsFileName
}

// InvestigateHeaderComments runs InvestigateHeaderComment on every header comment and returns
// the highest score found for each license, together with the names of the files whose
// comments yielded at least one candidate. commentsFileName[i] must be the name of the file
// texts[i] was extracted from. The fs argument is currently not used.
func InvestigateHeaderComments(texts [][]byte, fs filer.Filer, commentsFileName []string) (map[string]float32, []string) {
	best := map[string]float32{}
	matchedFiles := []string{}
	// TODO: output the best license per file, instead of the files with licenses plus the licenses found.
	for idx := range texts {
		found := InvestigateHeaderComment(texts[idx])
		if len(found) == 0 {
			continue
		}
		matchedFiles = append(matchedFiles, commentsFileName[idx])
		for license, score := range found {
			if score > best[license] {
				best[license] = score
			}
		}
	}
	return best, matchedFiles
}

// InvestigateHeaderComment looks up the text of a single header comment in the global
// license database with QuerySourceFile and returns the candidate licenses it reports.
func InvestigateHeaderComment(text []byte) map[string]float32 {
	db := globalLicenseDatabase()
	return db.QuerySourceFile(string(text))
}
11 changes: 11 additions & 0 deletions licensedb/internal/nlp.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,14 @@ func splitLicenseName(name string) []substring {
})
return result
}

// investigateSourceFile is a placeholder which is not implemented yet: it ignores
// its arguments and always returns an empty, non-nil map.
func investigateSourceFile(
	text string, licenseNameParts map[string][]substring,
	licenseNameSizes map[string]int) map[string]float32 {
	// TODO: split license-comments from description-comments. Separators to consider:
	//   =====
	//   ----
	//   \n\n\n
	//   import
	found := make(map[string]float32)
	return found
}
43 changes: 35 additions & 8 deletions licensedb/licensedb.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ var (

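// Detect returns the most probable reference licenses matched for the given
// file tree. Each match is assigned a confidence from 0 to 1, where 1 means 100% confident.
// When the licenses were found in the header comments of source files, the second result
// lists the names of those files; otherwise it is nil.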
func Detect(fs filer.Filer) (map[string]float32, error) {
func Detect(fs filer.Filer) (map[string]float32, []string, error) {
files, err := fs.ReadDir("")
if err != nil {
return nil, err
return nil, nil, err
}
fileNames := []string{}
for _, file := range files {
Expand All @@ -39,16 +39,43 @@ func Detect(fs filer.Filer) (map[string]float32, error) {
candidates := internal.ExtractLicenseFiles(fileNames, fs)
licenses := internal.InvestigateLicenseTexts(candidates)
if len(licenses) > 0 {
return licenses, nil
return licenses, nil, nil
}
// Plan B: take the README, find the section about the license and apply NER
candidates = internal.ExtractReadmeFiles(fileNames, fs)
if len(candidates) == 0 {
return nil, ErrNoLicenseFound
if len(candidates) > 0 {
licenses = internal.InvestigateReadmeTexts(candidates, fs)
if len(licenses) > 0 {
return licenses, nil, nil
}
}

// Plan C: look for license texts in the header comments of the source code files
extendedFileNames := []string{}
commentsFileName := []string{}
licensesFileNames := []string{}
extendedFileNames = extractAllSubfiles(fs, extendedFileNames, "")
candidates, commentsFileName = internal.ExtractSourceFiles(extendedFileNames, fs)
if len(candidates) > 0 {
licenses, licensesFileNames = internal.InvestigateHeaderComments(candidates, fs, commentsFileName)
}
licenses = internal.InvestigateReadmeTexts(candidates, fs)
if len(licenses) == 0 {
return nil, ErrNoLicenseFound
return nil, nil, ErrNoLicenseFound
}
return licenses, licensesFileNames, nil
}

// extractAllSubfiles walks the directory tree below path and appends the path of every
// entry which is not a directory to fileNames. It returns the extended slice.
// Directories which cannot be read are skipped without reporting the error.
func extractAllSubfiles(fs filer.Filer, fileNames []string, path string) []string {
files, err := fs.ReadDir(path)
if err == nil {
for _, subfile := range files {
currentPath := paths.Join(path, subfile.Name)
if subfile.IsDir {
// Descend into the subdirectory, accumulating into the same slice.
fileNames = extractAllSubfiles(fs, fileNames, currentPath)
} else {
fileNames = append(fileNames, currentPath)
}
}
}
// NOTE(review): the next line looks like a removed line of the old Detect body which the
// diff view shows here; it does not fit this function's single result — confirm.
return licenses, nil
return fileNames
}