Wikitext - [WIP] #150

Open. Wants to merge 30 commits into base: development.

Changes from 8 commits.

Commits (30 total)
2fe5e58
- added language model benchmark from MO-ASHA paper
ayushi-3536 May 17, 2022
83957a7
- added evaluation time as one of the objectives
ayushi-3536 May 17, 2022
918b0f9
- func name correction
ayushi-3536 May 17, 2022
9c5090b
- load and save tokenized file
ayushi-3536 May 17, 2022
f4d4413
-code formatting
ayushi-3536 May 19, 2022
a34d681
-make deterministic
ayushi-3536 May 21, 2022
9cb6a33
-added lock for download data
ayushi-3536 May 21, 2022
68cd917
-make emsize sampling log based: To be discussed with team - position…
ayushi-3536 May 21, 2022
812bdd0
Update Github Actions (#151)
PhMueller May 23, 2022
5f67bb2
-minor cleanup
ayushi-3536 May 24, 2022
49663ce
-minor cleanup
ayushi-3536 May 24, 2022
5d3d75e
Add YAHPO Benchmark (#142)
pfistfl May 30, 2022
ac9547a
ADD Multi-Objective Nasbench201 (v0.0.6) (#152)
PhMueller May 30, 2022
3f08eb2
Benchmark: Fair Adult from MO-ASHA (#148)
ayushi-3536 May 31, 2022
4c4f1d9
Multi Objective CNN benchmark: Flowers and Fashion (#147)
ayushi-3536 Jun 1, 2022
9e471d9
-add gpu support
ayushi-3536 Jun 4, 2022
1dee0c3
- added language model benchmark from MO-ASHA paper
ayushi-3536 May 17, 2022
c862cd6
- added evaluation time as one of the objectives
ayushi-3536 May 17, 2022
2a5d35d
- func name correction
ayushi-3536 May 17, 2022
29c4377
- load and save tokenized file
ayushi-3536 May 17, 2022
ab5f484
-code formatting
ayushi-3536 May 19, 2022
6e6af73
-make deterministic
ayushi-3536 May 21, 2022
9767d5c
-added lock for download data
ayushi-3536 May 21, 2022
f28e783
-make emsize sampling log based: To be discussed with team - position…
ayushi-3536 May 21, 2022
8ad4eaf
-minor cleanup
ayushi-3536 May 24, 2022
4412b70
-minor cleanup
ayushi-3536 May 24, 2022
c44bdfc
-add gpu support
ayushi-3536 Jun 4, 2022
130282e
-add MO abstract client
ayushi-3536 Jun 4, 2022
e7eb353
resolve conflict
ayushi-3536 Jun 4, 2022
f259291
- minimize objective values
ayushi-3536 Jun 4, 2022
6 changes: 6 additions & 0 deletions extra_requirements/lm_benchmark.json
@@ -0,0 +1,6 @@
{
"lm": [
"torch==1.3.0",
"tqdm"
]
}
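The extras file above pins the optional dependencies for the new benchmark. A small sketch for checking that they are importable before running it (the 'lm' extra name is taken from the JSON key above, following HPOBench's extra_requirements convention; assumes Python >= 3.8 for importlib.metadata):

import importlib.metadata

for package in ("torch", "tqdm"):
    try:
        # Report the installed version of each pinned dependency.
        print(package, importlib.metadata.version(package))
    except importlib.metadata.PackageNotFoundError:
        print(f"{package} is missing; install the benchmark's 'lm' extra first")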
368 changes: 368 additions & 0 deletions hpobench/benchmarks/mo/lm_benchmark.py
@@ -0,0 +1,368 @@
"""
Changelog:
==========

0.0.1:
* First implementation of the Multi-Objective Language Model Benchmark.
"""
import logging
import math
import random
import time
from typing import Dict, List, Union

import ConfigSpace as CS
import numpy as np
import torch
import torch.nn as nn
import tqdm

import hpobench.util.rng_helper as rng_helper
from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark
from hpobench.dependencies.lm.model import TransformerModel
from hpobench.dependencies.lm.tokenize_util import batchify
from hpobench.util.data_manager import LanguageModelDataManager

__version__ = '0.0.1'

logger = logging.getLogger('LM_Bench')


class LanguageModelBenchmark(AbstractMultiObjectiveBenchmark):

def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs):
super(LanguageModelBenchmark, self).__init__(rng=rng)

self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
data_manager = LanguageModelDataManager(self.device)
self.X_train, self.X_valid, self.X_test = data_manager.load()
self.ntokens = len(data_manager.corpus.dictionary)
self.__seed_everything()
self.variable = {"eval_batch_size": 10,
"nlayers": 2,
"bptt": 35,
"tied": True,
# number of attention heads
"nhead": 2,
"ntoken": self.ntokens
}
print("len of corpus dict", self.ntokens)

def __seed_everything(self):
"""Helperfunction: Make the benchmark deterministic by setting the correct seeds"""
seed = self.rng.randint(0, 100000)
print("seed obtained", seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True

@staticmethod
def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace:
"""Parameter space to be optimized --- contains the hyperparameters
"""
cs = CS.ConfigurationSpace(seed=seed)

cs.add_hyperparameters([
CS.UniformIntegerHyperparameter(
'batch_size', default_value=128, lower=8, upper=256
),
CS.UniformIntegerHyperparameter(
'emsize', default_value=128, lower=32, upper=1024, log=True
),
CS.UniformIntegerHyperparameter(
'lr_factor', default_value=50, lower=1, upper=100, log=True
),
CS.UniformFloatHyperparameter(
'lr', default_value=5, lower=1, upper=50, log=True
),
CS.UniformFloatHyperparameter(
'dropout', default_value=0.99, lower=0, upper=0.99
),
CS.UniformFloatHyperparameter(
'clip', default_value=0.99, lower=0.1, upper=2
)

])
return cs
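# Usage sketch (assumes only the ConfigSpace API used above): the returned space
# can be sampled directly, e.g.
#   cs = LanguageModelBenchmark.get_configuration_space(seed=0)
#   config = cs.sample_configuration()
# Note that 'emsize', 'lr_factor' and 'lr' are drawn on a log scale.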

@staticmethod
def get_objective_names() -> List[str]:
    return ['log_perplexity', 'accuracy', 'time']

@staticmethod
def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace:

fidelity_space = CS.ConfigurationSpace(seed=seed)
fidelity_space.add_hyperparameters([
CS.UniformIntegerHyperparameter(
'budget', lower=1, upper=81, default_value=81, log=False
)
])
return fidelity_space
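# Note (inferred from objective_function below): 'budget' is the number of
# training epochs, so an evaluation at full fidelity uses {'budget': 81}.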

@staticmethod
def get_meta_information() -> Dict:
""" Returns the meta information for the benchmark """
return {
'name': 'Multi-objective Asynchronous Successive Halving',
'references': ['@article{schmucker2021multi,'
'title={Multi-objective Asynchronous Successive Halving},'
'author={Schmucker, Robin and Donini, Michele and Zafar, Muhammad Bilal and Salinas,'
' David and Archambeau, C{\'e}dric},'
'journal={arXiv preprint arXiv:2106.12639},'
'year={2021}}',
],
}

def init_model(self, config: Union[CS.Configuration, Dict]):
""" Function that returns the model initialized based on the configuration and fidelity
"""

if isinstance(config, CS.Configuration):
[Review comment, Contributor]: Actually we don't need that check (if isinstance) here. The objective_function wrapper always casts configurations to dicts.
config = config.get_dictionary()
model = TransformerModel(
self.variable['ntoken'], config['emsize'], self.variable['nhead'], config['emsize'],
self.variable['nlayers'], config['dropout'])

return model

@AbstractMultiObjectiveBenchmark.check_parameters
def objective_function(self, configuration: Union[CS.Configuration, Dict],
fidelity: Union[Dict, CS.Configuration, None] = None,
rng: Union[np.random.RandomState, int, None] = None,
shuffle: bool = False,
**kwargs) -> Dict:
"""

Parameters
----------
configuration
fidelity: Dict, None
budget: int - Values: [1, 81]
    Number of epochs the model is trained for.
    Note: the number of epochs is 1-indexed (results after the first epoch: budget = 1).

Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None.
rng : np.random.RandomState, int, None
Random seed to use in the benchmark.

To prevent overfitting on a single seed, it is possible to pass a
parameter ``rng`` as 'int' or 'np.random.RandomState' to this function.
If this parameter is not given, the default random state is used.

[Review comment, Contributor]: Add shuffle here.

kwargs

Returns
-------
Dict -
function_value : Dict
    log_perplexity : float
    accuracy : float (validation accuracy)
    time : float (training plus evaluation time)
cost : float
    wall-clock time of the evaluation
info : Dict
    train_accuracy : float,
    validation_accuracy : float,
    test_accuracy : float,
    log_perplexity : float,
    perplexity : float,
    negative_log_perplexity : float,
    training_cost : float,
    valid_cost : float,
    test_cost : float,
fidelity : Dict
used fidelities in this evaluation
"""

self.rng = rng_helper.get_rng(rng)
self.__seed_everything()

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
ts_start = time.time()

# batchify data
batch_size = configuration['batch_size']
train_data = batchify(self.X_train, batch_size=batch_size).to(device)
val_data = batchify(self.X_valid, batch_size=self.variable["eval_batch_size"]).to(device)
test_data = batchify(self.X_test, batch_size=self.variable["eval_batch_size"]).to(device)

epochs = fidelity['budget']

model = self.init_model(configuration).to(device)

criterion = nn.CrossEntropyLoss()

learning_rate = configuration['lr']
learning_rate_factor = configuration['lr_factor']
clip = configuration['clip']
best_val_loss = None
train_time = 0
eval_time = 0

t = tqdm.tqdm(total=epochs)
for epoch in range(epochs):
epoch_start_time = time.time()
train_loss, train_acc = model.train_fun(self.ntokens, criterion, train_data, learning_rate, clip)
train_time += time.time() - epoch_start_time
start = time.time()
val_loss, val_acc = model.eval_fun(self.ntokens, criterion, val_data)
val_loss = np.clip(val_loss, 1e-10, 10)
print("val acc for last epoch", val_acc)
[Review comment, Contributor]: logging.debug, see above.
eval_time += time.time() - start

t.set_postfix(val_accuracy=val_acc)
t.update()

if not np.isfinite(val_loss):
val_loss = 7

# Keep the best validation loss seen so far.
if not best_val_loss or val_loss < best_val_loss:
best_val_loss = val_loss
else:
# Anneal the learning rate if no improvement has been seen on the validation set.
learning_rate /= learning_rate_factor

start_time = time.time()
_, test_acc = model.eval_fun(self.ntokens, criterion, test_data)
eval_test_runtime = time.time() - start_time

perplexity = math.exp(best_val_loss)
log_perplexity = best_val_loss
neg_log_perplexity = 10 - best_val_loss
elapsed_time = float(time.time() - ts_start)
[Review comment, Contributor]: elapsed_time is already a float.

return {'function_value': {'log_perplexity': log_perplexity,
'accuracy': val_acc.item(),
'time': train_time + eval_time
},
'cost': elapsed_time,
'info': {'train_accuracy': train_acc.item(),
'validation_accuracy': val_acc.item(),
'test_accuracy': test_acc.item(),
'log_perplexity': log_perplexity,
'perplexity': perplexity,
'negative_log_perplexity': neg_log_perplexity,
'training_cost': train_time,
'valid_cost': eval_time,
'test_cost': eval_test_runtime,
'fidelity': fidelity
}
}

@AbstractMultiObjectiveBenchmark.check_parameters
def objective_function_test(self, configuration: Union[CS.Configuration, Dict],
fidelity: Union[Dict, None] = None,
rng: Union[np.random.RandomState, int, None] = None,
shuffle: bool = False,
**kwargs) -> Dict:
"""
Get the test results. Runs a given configuration on the largest budget (here: 81).
Parameters
----------
configuration
fidelity: Dict, None
budget: int - Value: 81 (only the full budget is accepted here, see the assert below)
[Review comment, Contributor]: We only allow 81. Maybe this is something we should discuss: it could also be okay to query the performance at different epoch steps.

Number of epochs the model is trained for.
Note: the number of epochs is 1-indexed (results after the first epoch: budget = 1).

Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None.
rng : np.random.RandomState, int, None
Random seed to use in the benchmark.

To prevent overfitting on a single seed, it is possible to pass a
parameter ``rng`` as 'int' or 'np.random.RandomState' to this function.
If this parameter is not given, the default random state is used.

[Review comment, Contributor]: shuffle is missing.

kwargs
Returns
-------
Dict -
function_value : Dict
    log_perplexity : float
    accuracy : float (test accuracy)
    time : float (training plus evaluation time)
cost : float
    wall-clock time of the evaluation
info : Dict
    train_accuracy : float,
    test_accuracy : float,
    log_perplexity : float,
    perplexity : float,
    negative_log_perplexity : float,
    training_cost : float,
    test_cost : float,
fidelity : Dict
used fidelities in this evaluation
"""

# The result dict should already contain all necessary information -> just swap the function value from valid
[Review comment, Contributor]: Outdated comment.
# to test and the corresponding time cost
assert fidelity['budget'] == 81, 'Only test data for the 81st epoch is available.'
ts_start = time.time()

self.rng = rng_helper.get_rng(rng)
[Review comment, Contributor]: Suggested change
-    self.rng = rng_helper.get_rng(rng)
+    self.rng = rng_helper.get_rng(self.rng, rng)

self.__seed_everything()

# batchify data
batch_size = configuration['batch_size']
train_data = batchify(self.X_train, batch_size=batch_size)
val_data = batchify(self.X_valid, batch_size=batch_size)
train_data = np.vstack((train_data, val_data))
train_data = torch.tensor(train_data).to(self.device)
test_data = batchify(self.X_test, batch_size=self.variable["eval_batch_size"]).to(self.device)

epochs = fidelity['budget']

model = self.init_model(configuration).to(self.device)

criterion = nn.CrossEntropyLoss()

learning_rate = configuration['lr']
learning_rate_factor = configuration['lr_factor']
clip = configuration['clip']
best_test_loss = None
train_time = 0
eval_time = 0
t = tqdm.tqdm(total=epochs)
for epoch in range(1, epochs + 1):
epoch_start_time = time.time()
train_loss, train_acc = model.train_fun(self.ntokens, criterion, train_data, learning_rate,
clip)
train_time += time.time() - epoch_start_time
start = time.time()

test_loss, test_acc = model.eval_fun(self.ntokens, criterion, test_data)
test_loss = np.clip(test_loss, 1e-10, 10)
eval_time += time.time() - start

t.set_postfix(test_accuracy=test_acc)
t.update()
if not np.isfinite(test_loss):
test_loss = 7

# Keep the best test loss seen so far.
if not best_test_loss or test_loss < best_test_loss:
best_test_loss = test_loss
else:
# Anneal the learning rate if no improvement has been seen on the test set.
learning_rate /= learning_rate_factor

perplexity = math.exp(best_test_loss)
log_perplexity = best_test_loss
neg_log_perplexity = 10 - best_test_loss
elapsed_time = float(time.time() - ts_start)

return {'function_value': {'log_perplexity': log_perplexity,
'accuracy': test_acc.item(),
'time': train_time + eval_time
},
'cost': elapsed_time,
'info': {'train_accuracy': train_acc.item(),
'test_accuracy': test_acc.item(),
'log_perplexity': log_perplexity,
'perplexity': perplexity,
'negative_log_perplexity': neg_log_perplexity,
'training_cost': train_time,
'test_cost': eval_time,
'fidelity': fidelity
}
}

__all__ = ["LanguageModelBenchmark"]
[Review comment, Contributor]: Add a new line at the end of the file -> checkstyle error.
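For reviewers who want to try the branch locally, a minimal usage sketch (assuming the branch is installed together with its 'lm' extra and that the WikiText download performed by LanguageModelDataManager succeeds; argument names follow the spaces defined above):

from hpobench.benchmarks.mo.lm_benchmark import LanguageModelBenchmark

benchmark = LanguageModelBenchmark(rng=1)
config = benchmark.get_configuration_space(seed=1).sample_configuration()

# A cheap one-epoch run; the full fidelity would be {'budget': 81}.
result = benchmark.objective_function(configuration=config,
                                      fidelity={'budget': 1},
                                      rng=1)
print(result['function_value'])  # {'log_perplexity': ..., 'accuracy': ..., 'time': ...}
print(result['cost'])            # wall-clock time of the evaluation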
12 changes: 12 additions & 0 deletions hpobench/container/benchmarks/mo/lm_benchmark.py
@@ -0,0 +1,12 @@
""" Benchmark for the Multi-Objective Language Model Benchmark from hpobench/benchmarks/mo/lm_benchmark.py
"""

from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient


class LanguageModelBenchmark(AbstractBenchmarkClient):
def __init__(self, **kwargs):
kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LanguageModelBenchmark')
kwargs['container_name'] = kwargs.get('container_name', 'lm_benchmark')
kwargs['latest'] = kwargs.get('container_tag', '0.0.1')
super(LanguageModelBenchmark, self).__init__(**kwargs)
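The container client mirrors the local class. A usage sketch, assuming the 'lm_benchmark' container image has been built and registered (which is not part of this diff):

from hpobench.container.benchmarks.mo.lm_benchmark import LanguageModelBenchmark

# Same call signature as the local benchmark; the evaluation itself runs inside
# the container resolved via benchmark_name/container_name above.
benchmark = LanguageModelBenchmark(rng=1)
config = benchmark.get_configuration_space(seed=1).sample_configuration()
result = benchmark.objective_function(configuration=config, fidelity={'budget': 1})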