-
Notifications
You must be signed in to change notification settings - Fork 496
/
Copy pathclassifier.py
122 lines (92 loc) · 3.52 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import logging
import string
from argparse import Namespace
from functools import lru_cache
from typing import Any
from typing import Dict
from typing import Optional
from typing import Union
from ..core.plugins import Plugin
from ..plugins.private_key import PrivateKeyDetector
from ..settings import get_settings
Pipeline = Any
logger = logging.getLogger(__name__)
def is_feature_enabled() -> bool:
try:
import torch
import transformers
print(transformers.__version__)
print(torch.__version__)
return True
except Exception:
return False
def is_feature_ready(args: Namespace) -> bool:
try:
temp = vars(args)
answer = True
entries = ['huggingface_model', 'threshold', 'huggingface_token']
for entry in entries:
answer = answer and temp[entry] is not None
return answer
except Exception:
return False
def initialize(
huggingface_model: str = None,
threshold: float = 0.8,
huggingface_token: Optional[str] = None,
) -> None:
"""
:param limit: this limit was obtained through trial and error. Check out
the original pull request for rationale.
:raises: ValueError
"""
path = huggingface_model
get_model(huggingface_model, huggingface_token)
config: Dict[str, Union[float, str, Optional[str]]] = {
'threshold': threshold,
}
if huggingface_model:
config['model'] = huggingface_model
config['huggingface_token'] = huggingface_token
path = f'{__name__}.should_exclude_secret'
get_settings().filters[path] = config
def should_exclude_secret(secret: str, plugin: Optional[Plugin] = None) -> bool:
"""
:param plugin: optional, for easier testing. The dependency injection system
will populate its proper value on complete runs.
"""
# Private keys are actual words, so they will be a false negative.
if isinstance(plugin, PrivateKeyDetector):
return False
if not (set(secret) - set(string.hexdigits + '-')):
return False
model_name = get_settings().filters[f'{__name__}.should_exclude_secret']['model']
token = get_settings().filters[f'{__name__}.should_exclude_secret']['huggingface_token']
threshold = get_settings().filters[f'{__name__}.should_exclude_secret']['threshold']
if not get_model(model_name, token):
raise AssertionError('Attempting to use uninitialized HuggingFace model.')
pipeline = get_model(model_name, token)
result: Dict[str, Union[str, float]] = pipeline(secret)[0]
return result['label'] == 'LABEL_1' and result['score'] >= threshold
@lru_cache(maxsize=1)
def get_model(model_name: str, huggingface_token: str) -> 'Pipeline':
import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_name, token=huggingface_token)
model = model.share_memory()
tokenizer = AutoTokenizer.from_pretrained(model_name, token=huggingface_token)
if torch.cuda.is_available():
logger.info('CUDA is available. Using GPU for Bert model.')
return pipeline(
'text-classification',
model=model,
tokenizer=tokenizer,
device=torch.cuda.current_device(),
)
else:
logger.info('CUDA is not available. Using CPU for Bert model.')
return pipeline(
'text-classification',
model=model_name,
use_auth_token=huggingface_token,
)