Add a flush method to dumper
Allows dumping unfinished sentences. Solves issue #173.
kleag committed May 11, 2024
1 parent 79bfaa6 commit c45653d
Showing 2 changed files with 60 additions and 65 deletions.
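The fix, in brief: `AnalysisToConllU` buffers the tokens of the current sentence and only writes them out when a sentence or paragraph break arrives, so input that ended mid-sentence was silently dropped. The new `flush()` makes that final write explicit. Below is a minimal self-contained sketch of that contract; `ToyDumper` is a hypothetical stand-in, not the project's actual dumper classes:

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for a CoNLL-U dumper: tokens of the current sentence
// are buffered and only written on a sentence break -- or, with this commit,
// on an explicit flush().
class ToyDumper {
public:
  void add(const std::string& form, bool sentence_break) {
    m_tokens.push_back(form);
    if (sentence_break) flush();
  }
  void flush() {
    if (m_tokens.empty()) return;
    for (std::size_t i = 0; i < m_tokens.size(); ++i)
      std::cout << (i + 1) << '\t' << m_tokens[i] << '\n';
    std::cout << '\n';  // a blank line terminates a CoNLL-U sentence
    m_tokens.clear();
  }
private:
  std::vector<std::string> m_tokens;
};

int main() {
  ToyDumper dumper;
  dumper.add("An", false);
  dumper.add("unfinished", false);
  dumper.add("sentence", false);  // input ends here, no sentence break seen
  dumper.flush();                 // without this call the tokens are lost (the issue #173 scenario)
}
```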
46 changes: 24 additions & 22 deletions deeplima/apps/deeplima.cpp
@@ -50,8 +50,8 @@ class file_parser
public:
std::shared_ptr<segmentation::ISegmentation> psegm = nullptr;
std::shared_ptr< ITokenSequenceAnalyzer > panalyzer = nullptr;
- std::shared_ptr< dumper::AbstractDumper > pdumper = nullptr;
- std::shared_ptr< dumper::DumperBase > pDumperBase = nullptr;
+ std::shared_ptr< dumper::AbstractDumper > pdumper_segm_only = nullptr; // used when using segmentation only
+ std::shared_ptr< dumper::DumperBase > pdumper_complete = nullptr; // used when using tagger
std::shared_ptr<DependencyParser> parser = nullptr;

/**
@@ -154,7 +154,7 @@ void init(const std::map<std::string, std::string>& models_fn,
{
// with dependency parsing
auto conllu_dumper = std::make_shared<dumper::AnalysisToConllU<typename DependencyParser::TokenIterator> >();
- pDumperBase = conllu_dumper;
+ pdumper_complete = conllu_dumper;

try
{
@@ -215,19 +215,20 @@ void init(const std::map<std::string, std::string>& models_fn,
{
// without dependency parsing
auto conllu_dumper = std::make_shared< dumper::AnalysisToConllU<TokenSequenceAnalyzer<>::TokenIterator> >();
- pDumperBase = conllu_dumper;
+ pdumper_complete = conllu_dumper;

for (size_t i = 0; i < panalyzer->get_classes().size(); ++i)
{
conllu_dumper->set_classes(i, panalyzer->get_class_names()[i], panalyzer->get_classes()[i]);
}

- panalyzer->register_handler([conllu_dumper](std::shared_ptr< StringIndex > stridx,
-                                             const token_buffer_t<>& tokens,
-                                             const std::vector<StringIndex::idx_t>& lemmata,
-                                             std::shared_ptr< StdMatrix<uint8_t> > classes,
-                                             size_t begin,
-                                             size_t end)
+ panalyzer->register_handler([conllu_dumper](
+     std::shared_ptr< StringIndex > stridx,
+     const token_buffer_t<>& tokens,
+     const std::vector<StringIndex::idx_t>& lemmata,
+     std::shared_ptr< StdMatrix<uint8_t> > classes,
+     size_t begin,
+     size_t end)
{
typename TokenSequenceAnalyzer<>::TokenIterator ti(*stridx,
tokens,
@@ -252,21 +253,21 @@ void init(const std::map<std::string, std::string>& models_fn,
{
switch (out_fmt) {
case 1:
- pdumper = std::make_shared<dumper::TokensToConllU>();
+ pdumper_segm_only = std::make_shared<dumper::TokensToConllU>();
break;
case 2:
- pdumper = std::make_shared<dumper::Horizontal>();
+ pdumper_segm_only = std::make_shared<dumper::Horizontal>();
break;
default:
throw std::runtime_error("Unknown output format");
break;
}
- // psegm->register_handler([pdumper]
+ // psegm->register_handler([pdumper_segm_only]
// (const std::vector<segmentation::token_pos>& tokens,
// uint32_t len)
// {
// // std::cerr << "In psegm handler. Calling pdumper functor" << std::endl;
// (*pdumper)(tokens, len);
// // std::cerr << "In psegm handler. Calling pdumper_segm_only functor" << std::endl;
// (*pdumper_segm_only)(tokens, len);
// });
}

@@ -315,8 +316,8 @@ void parse_file(std::istream& input,
(const std::vector<segmentation::token_pos>& tokens,
uint32_t len)
{
// std::cerr << "In psegm handler. Calling pdumper functor" << std::endl;
(*pdumper)(tokens, len);
// std::cerr << "In psegm handler. Calling pdumper_segm_only functor" << std::endl;
(*pdumper_segm_only)(tokens, len);
});

}
@@ -338,7 +339,8 @@ void parse_file(std::istream& input,

// std::cerr << "Waiting for PoS tagger to stop. Calling panalyzer->finalize" << std::endl;
panalyzer->finalize();
// std::cerr << "Analyzer stopped. panalyzer->finalize returned" << std::endl;
pdumper_complete->flush();
std::cerr << "Analyzer stopped. panalyzer->finalize returned" << std::endl;
}

if (parser)
@@ -353,11 +355,11 @@
auto parsing_duration = std::chrono::duration_cast<std::chrono::milliseconds>(parsing_end - parsing_begin);

uint64_t token_counter = 0;
- if(nullptr != pdumper)
-   token_counter = pdumper->get_token_counter();
- else if (nullptr != pDumperBase)
+ if(nullptr != pdumper_segm_only)
+   token_counter = pdumper_segm_only->get_token_counter();
+ else if (nullptr != pdumper_complete)
{
-   token_counter = pDumperBase->get_token_counter();
+   token_counter = pdumper_complete->get_token_counter();
}
else
{
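A note on the renaming above: `pdumper` / `pDumperBase` become `pdumper_segm_only` / `pdumper_complete`, making explicit which pointer serves the segmentation-only path and which serves the full tagging path. The `register_handler` calls follow a store-a-callback pattern; here is a self-contained sketch with hypothetical types (not the real `ITokenSequenceAnalyzer` API beyond the method name):

```cpp
#include <cstddef>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical reduction of the register_handler pattern in deeplima.cpp:
// the analyzer stores one callback and invokes it for each [begin, end)
// span of analyzed tokens; in the real code the lambda captures the dumper
// and forwards the span to it.
class ToyAnalyzer {
public:
  using Handler = std::function<void(const std::vector<std::string>& tokens,
                                     std::size_t begin, std::size_t end)>;
  void register_handler(Handler h) { m_handler = std::move(h); }
  void emit(const std::vector<std::string>& tokens) {
    if (m_handler) m_handler(tokens, 0, tokens.size());
  }
private:
  Handler m_handler;
};

int main() {
  ToyAnalyzer analyzer;
  analyzer.register_handler([](const std::vector<std::string>& tokens,
                               std::size_t begin, std::size_t end) {
    for (std::size_t i = begin; i < end; ++i)
      std::cout << (i + 1) << '\t' << tokens[i] << '\n';
  });
  analyzer.emit({"Une", "phrase", "inachevée"});
}
```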
79 changes: 36 additions & 43 deletions deeplima/include/deeplima/dumper_conllu.h
@@ -284,6 +284,7 @@ class DumperBase
public:
virtual ~DumperBase() = default;
virtual uint64_t get_token_counter() const = 0;
+ virtual void flush() = 0;
};

template <class I>
@@ -383,20 +384,35 @@ class AnalysisToConllU : public DumperBase
return feat_str;
}

- // void flush()
- // {
- //   if (!m_tokens.empty())
- //   {
- //     for (const auto& token: m_tokens)
- //     {
- //       std::cerr << token ;
- //       std::cout << token ;
- //     }
- //     std::cout << std::endl;
- //   }
- //   m_tokens.clear();
- //   m_root = 0;
- // }
+ virtual void flush()
+ {
+   m_next_token_idx = 1;
+   std::vector<uint32_t> heads(m_tokens.size()+1);
+   heads[0] = 0;
+   for (size_t i = 1; i < heads.size(); i++)
+   {
+     heads[i] = m_tokens[i-1].head;
+   }
+   // std::cerr << "AnalysisToConllU::flush() heads before find_cycle: " << heads << std::endl;
+   while (find_cycle(heads, m_root))
+   {
+     // std::cerr << "AnalysisToConllU::flush() heads after cycle found: " << heads << std::endl;
+   }
+   // std::cerr << "AnalysisToConllU::flush() heads after no more cycle: " << heads << std::endl;
+   for (size_t i = 1; i < heads.size(); i++)
+   {
+     m_tokens[i-1].head = heads[i];
+   }
+   for (const auto& token: m_tokens)
+   {
+     // std::cerr << "AnalysisToConllU::flush():567 " << token ;
+     std::cout << token ;
+   }
+   m_tokens.clear();
+   // std::cerr << "after clearing tokens. m_next_token_idx=" << m_next_token_idx << std::endl;
+   std::cout << std::endl;
+   m_root = 0;
+ }

void operator()(I& iter, uint32_t begin, uint32_t end, bool hasDeps = false)
{
@@ -406,7 +422,7 @@ class AnalysisToConllU : public DumperBase
if (m_next_token_idx == 1)
{
// std::cerr << "AnalysisToConllU::operator() m_next_token_idx=" << m_next_token_idx << std::endl;
- // std::cout << std::endl;
+ std::cout << std::endl;
m_root = 0;
}
else if (m_next_token_idx == 0)
@@ -447,7 +463,8 @@ class AnalysisToConllU : public DumperBase
temp = str;
continue;
}
- // std::cerr << m_next_token_idx << "\t" << str << "\t" << iter.lemma() << "\t" << m_classes[0][iter.token_class(0)];
+ // std::cerr << m_next_token_idx << "\t" << str << "\t" << iter.lemma()
+ //           << "\t" << m_classes[0][iter.token_class(0)] << std::endl;
token.id = m_next_token_idx;
token.form = str;
token.lemma = iter.lemma();
@@ -543,33 +560,9 @@ class AnalysisToConllU : public DumperBase
if (iter.flags() & deeplima::segmentation::token_pos::flag_t::sentence_brk ||
iter.flags() & deeplima::segmentation::token_pos::flag_t::paragraph_brk)
{
// std::cerr << "on sent/para break. m_next_token_idx=" << m_next_token_idx << std::endl;
// std::cout << std::endl;
m_next_token_idx = 1;
std::vector<uint32_t> heads(m_tokens.size()+1);
heads[0] = 0;
for (size_t i = 1; i < heads.size(); i++)
{
heads[i] = m_tokens[i-1].head;
}
// std::cerr << "AnalysisToConllU::operator() heads before find_cycle: " << heads << std::endl;
while (find_cycle(heads, m_root))
{
// std::cerr << "AnalysisToConllU::operator() heads after cycle found: " << heads << std::endl;
}
// std::cerr << "AnalysisToConllU::operator() heads after no more cycle: " << heads << std::endl;
for (size_t i = 1; i < heads.size(); i++)
{
m_tokens[i-1].head = heads[i];
}
for (const auto& token: m_tokens)
{
// std::cerr << token ;
std::cout << token ;
}
m_tokens.clear();
// std::cerr << "after clearing tokens. m_next_token_idx=" << m_next_token_idx << std::endl;
std::cout << std::endl;
// std::cerr << "AnalysisToConllU::operator() on sent/para break. m_next_token_idx="
// << m_next_token_idx << std::endl;
flush();
}
iter.next();
}
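The body of the new `flush()` is the sentence-finalization code that previously lived inline in `operator()`: it copies the predicted heads into a 1-based vector, calls `find_cycle` repeatedly until the head assignments form a tree, writes the buffered tokens, and resets the sentence state. The following is a self-contained sketch of one plausible `find_cycle` contract, stated as an assumption (the project's actual implementation may differ); here a detected cycle is broken by reattaching one of its nodes to the root:

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Assumed contract, mirroring the loop in flush(): scan heads[1..n] for a
// cycle; if one is found, reroute one of its arcs to the root and return
// true. Return false once the head vector is a tree. A sketch only, not
// the library's real find_cycle.
bool find_cycle(std::vector<uint32_t>& heads, uint32_t root) {
  const std::size_t n = heads.size();
  std::vector<uint8_t> color(n, 0);  // 0 = unseen, 1 = on current path, 2 = finished
  for (uint32_t start = 1; start < n; ++start) {
    if (color[start] != 0) continue;
    std::vector<uint32_t> path;
    uint32_t v = start;
    while (v != 0 && color[v] == 0) {  // node 0 is the virtual root
      color[v] = 1;
      path.push_back(v);
      v = heads[v];
    }
    if (v != 0 && color[v] == 1) {  // re-entered the current path: cycle
      heads[v] = root;              // break it by reattaching to the root
      for (uint32_t u : path) color[u] = 2;
      return true;
    }
    for (uint32_t u : path) color[u] = 2;
  }
  return false;
}

int main() {
  // Tokens 1..3 where 1 -> 2 -> 3 -> 1 forms a cycle; index 0 is the virtual root.
  std::vector<uint32_t> heads = {0, 2, 3, 1};
  while (find_cycle(heads, /*root=*/0)) {}
  for (std::size_t i = 1; i < heads.size(); ++i)
    std::cout << i << " -> " << heads[i] << '\n';  // now acyclic: 1 attaches to 0
}
```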
