-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcld2nlpt.go
210 lines (188 loc) · 6.52 KB
/
cld2nlpt.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
// Package cld2 implements language detection using the
// Compact Language Detector.
//
// The `nlpt` part refers to a side project of mine: a Natural Language Processing Toolkit written in Go.
//
// This package includes the relevant sources from the cld2
// project, so it doesn't require any external dependencies.
// It also uses the cld2_nlpt.h and cld2_nlpt.cc files to create a specific CLD2_* namespace
// to distinguish usage here with the original c++ project.
//
// For more information about CLD2, see https://code.google.com/p/cld2/.
//
// This package leaned heavily on two existing projects:
// cld2 go wrapper for extracting relevant code: https://github.com/rainycape/cld2
// rust-cld2 wrapper for creating custom header and c++ files for the CLD2_* namespace https://github.com/emk/rust-cld2
//
package cld2
/*
#cgo CXXFLAGS: -std=c++03
#include <stdlib.h>
#include "cld2nlpt.h"
*/
import "C"
import (
"fmt"
"time"
"unsafe"
)
/*
type LanguageDialect string
type LanguageInfo struct {
Language Language // language code; "en"
Dialect LanguageDialect // language code + dialect; "en-uk"
Scores [3]LanguageScore // the 3 most likely languages
Reliable bool // is the result reliable?
}
type LanguageScore struct {
Dialect LanguageDialect
Percent int // probability/"confidence"
}
func DetectLanguageInfo() LanguageInfo {
...
}
type Cld2Hints struct {
ContentLanguageHint string
TldHint string
EncodingHint int
LanguageHint int
}
c_hints := Cld2Hints{
ContentLanguageHint: C.CString(""),
TldHint: C.CString(""),
EncodingHint: C.int(0),
LanguageHint: C.int(0),
}
*/
// Language is the string form of a detected language as returned by the
// detection functions in this package, e.g. the code "en" or the name
// "ENGLISH", depending on the requested format.
type Language string
// Cld2NlptError is the error type returned by the detection functions in this
// package when the underlying C call yields a nil result.
type Cld2NlptError struct {
	When time.Time // timestamp attached to the error
	Msg  string    // human-readable description of what went wrong
}

// Error implements the error interface. The message has the form
// "<When>, <Msg>".
func (e Cld2NlptError) Error() string {
	// Sprintf, not Sprint: the first argument is a format string and its
	// verbs must be substituted rather than printed literally.
	return fmt.Sprintf("%v, %v", e.When, e.Msg)
}
// SimpleDetect returns the language code for the language detected in the
// given text, e.g. "en". It uses nlpt_wrapper.h; the C++ function sets the
// buffer length to that of the text.
//
// By default the C++ side defines plain text as true and reliable as true.
// This means it will not strip out HTML tags and returns only detection
// rankings that are known to be reliable.
//
// It also defaults the return value to UNKNOWN_LANGUAGE before running
// detection. If a language cannot be reliably detected then UNKNOWN is
// returned.
//
//	bool is_plain_text = true;
//	bool is_reliable = true;
//	...
//	CLD2::Language summary_lang = CLD2::UNKNOWN_LANGUAGE;
//
// A Cld2NlptError is returned when the C call yields a nil pointer; in that
// case the returned Language is empty.
func SimpleDetect(text string) (lang Language, err error) {
	cText := C.CString(text)
	// Register the free immediately so the C copy is released on every path.
	defer C.free(unsafe.Pointer(cText))

	res := C.CLD2_Static_ExtDetectLanguageSummary(cText)
	if res == nil {
		return "", Cld2NlptError{
			When: time.Now(),
			Msg:  "result returned nil: C.CLD2_Static_ExtDetectLanguageSummary(cs)",
		}
	}
	return Language(C.GoString(res)), nil
}
// DetectLanguage uses nlpt_wrapper.h to detect the language of text and
// returns it in the requested format.
//
// cld2 defaults languages to ENGLISH, so any unreliable detection returns that
// default; this can yield wrong results, especially for small data sets.
// By default the C++ side defines plain text as true and reliable as true.
// This means it will not strip out HTML tags and returns only detection
// rankings that are known to be reliable.
//
//	bool is_plain_text = true;
//	bool is_reliable = true;
//
// Format choices:
//
//	"name"     returns "ENGLISH"
//	"code"     returns "en"
//	"declname" returns "ENGLISH"
//
// Any other format value is treated like "code".
//
// If buffer_length is less than or equal to zero the C++ code sets the length
// to that of the text. See cld2_nlpt_test for usage.
//
// A Cld2NlptError is returned when the formatting call yields a nil pointer;
// in that case the returned Language is empty.
func DetectLanguage(buffer_length int, text, format string) (lang Language, err error) {
	cText := C.CString(text)
	defer C.free(unsafe.Pointer(cText))

	var detected C.Language = C.CLD2_DetectLanguage(cText, C.int(buffer_length))

	// The formatter results are never freed here (the original code did not
	// free them either), so no scratch C string needs to be allocated.
	var cName *C.char
	switch format {
	case "name":
		cName = C.CLD2_LanguageName(detected)
	case "declname":
		cName = C.CLD2_LanguageDeclaredName(detected)
	default: // "code" and any unrecognised format
		cName = C.CLD2_LanguageCode(detected)
	}

	if cName == nil {
		return "", Cld2NlptError{
			When: time.Now(),
			Msg:  "result returned nil: C.CLD2_LanguageName(C.CLD2_DetectLanguage(cs, b_length))",
		}
	}
	return Language(C.GoString(cName)), nil
}
// DetectExtendedLanguage uses nlpt_wrapper.h to detect the language of text
// via the extended summary call and returns it in the requested format.
//
// By default the C++ side defines plain text as true, reliable as true, and
// language hints as unknown. This means it will not strip out HTML tags,
// returns only detection rankings that are known to be reliable, and in the
// case of no reliability returns UNKNOWN.
//
//	bool is_plain_text = true;
//	bool is_reliable = true;
//	...
//	CLD2::CLDHints cldhints = {NULL, NULL, 0, CLD2::UNKNOWN_LANGUAGE};
//
// rank, percent and normal_score are passed through to the C++ wrapper and
// select out of the index of ranked languages, the percent, and the normal
// score. See cld2 for more information on what these choices mean; the
// original author recommends 3 for most accuracy.
//
// Format choices:
//
//	"name"     returns "ENGLISH"
//	"code"     returns "en"
//	"declname" returns "ENGLISH"
//
// Any other format value is treated like "code".
//
// If buffer_length is less than or equal to zero the C++ code sets the length
// to that of the text. See cld2_nlpt_test for usage.
//
// A Cld2NlptError is returned when the formatting call yields a nil pointer;
// in that case the returned Language is empty.
//
// TODO: add support for passing language hints. This will require a mapping
// table for the C++ table of supported languages.
func DetectExtendedLanguage(text string, format string, buffer_length, rank, percent, normal_score int) (lang Language, err error) {
	cText := C.CString(text)
	defer C.free(unsafe.Pointer(cText))

	var detected C.Language = C.CLD2_DetectExtendLanguageSummary(
		cText,
		C.int(buffer_length),
		C.int(rank),
		C.int(percent),
		C.int(normal_score),
	)

	// The formatter results are never freed here (the original code did not
	// free them either), so no scratch C string needs to be allocated.
	var cName *C.char
	switch format {
	case "name":
		cName = C.CLD2_LanguageName(detected)
	case "declname":
		cName = C.CLD2_LanguageDeclaredName(detected)
	default: // "code" and any unrecognised format
		cName = C.CLD2_LanguageCode(detected)
	}

	if cName == nil {
		return "", Cld2NlptError{
			When: time.Now(),
			Msg:  "result returned nil: C.CLD2_DetectExtendLanguageSummary",
		}
	}
	return Language(C.GoString(cName)), nil
}