Skip to content

Commit 4bdceeb

Browse files
committed
cleanup and better document some regular expressions
1 parent 929c2e8 commit 4bdceeb

File tree

3 files changed

+81
-51
lines changed

3 files changed

+81
-51
lines changed

jsonld-cpp/NQuadsSerialization.cpp

Lines changed: 29 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,14 @@
1212
#include <regex>
1313
#include <iostream>
1414

15+
using namespace RDFRegex;
1516

1617
namespace {
1718

1819
// return vector of strings after splitting input string into lines
1920
std::vector<std::string> splitLines(std::string &input) {
2021
std::vector<std::string> lines;
21-
std::regex rgx(RDFRegex::EOLN);
22+
std::regex rgx(EOLN);
2223
std::sregex_token_iterator i(input.begin(), input.end(), rgx, -1);
2324
std::sregex_token_iterator end;
2425

@@ -75,7 +76,9 @@ namespace {
7576
}
7677

7778
void extractUnicodeCodepoint(const std::smatch &match, std::string &u) {
78-
std::string hex = match[2].matched ? match[2].str() : match[3].str(); // todo: magic numbers?
79+
std::string hex = match[UNICODE_BASIC_MULTILINGUAL_PLANE].matched ?
80+
match[UNICODE_BASIC_MULTILINGUAL_PLANE].str() :
81+
match[UNICODE_HIGHER_PLANE].str();
7982
long v = std::stol(hex, nullptr, 16);
8083

8184
auto it = std::back_inserter(u);
@@ -97,7 +100,7 @@ namespace {
97100
}
98101

99102
bool extractControlCharacter(const std::smatch &match, std::string &u) {
100-
char c = match[1].str()[0];
103+
char c = match[UNICODE_CONTROL_CHARS].str()[0];
101104
switch (c) {
102105
case 'b':
103106
u = "\b";
@@ -108,9 +111,6 @@ namespace {
108111
case 'n':
109112
u = "\n";
110113
break;
111-
case 'v': // todo: why here?
112-
u = "\v";
113-
break;
114114
case 'f':
115115
u = "\f";
116116
break;
@@ -141,7 +141,7 @@ namespace {
141141
if(str.empty())
142142
return;
143143

144-
std::regex charsRgx(RDFRegex::UCHAR_MATCHED);
144+
std::regex charsRgx(UNICODE_CODEPOINT);
145145
auto chars_begin = std::sregex_iterator(str.begin(), str.end(), charsRgx);
146146
auto chars_end = std::sregex_iterator();
147147

@@ -160,7 +160,7 @@ namespace {
160160
out = std::copy(match.prefix().first, match.prefix().second, out);
161161

162162
std::string u;
163-
if (!match[1].matched) {
163+
if (!match[UNICODE_CONTROL_CHARS].matched) {
164164
extractUnicodeCodepoint(match, u);
165165
} else {
166166
if(!extractControlCharacter(match, u))
@@ -293,11 +293,11 @@ std::string NQuadsSerialization::toNQuad(const RDF::RDFTriple& triple) {
293293
}
294294

295295
RDF::RDFDataset NQuadsSerialization::parse(std::string input) {
296-
RDF::RDFDataset dataset((JsonLdOptions()));//todo: should be a version of this that passes in existing options object?
296+
RDF::RDFDataset dataset((JsonLdOptions()));
297297

298298
std::vector<std::string> lines = ::splitLines(input);
299-
std::regex emptyRgx(RDFRegex::EMPTY);
300-
std::regex quadRgx(RDFRegex::QUAD);
299+
std::regex emptyRgx(EMPTY);
300+
std::regex quadRgx(QUAD);
301301
std::smatch match;
302302

303303
int lineNumber = 0;
@@ -311,44 +311,44 @@ RDF::RDFDataset NQuadsSerialization::parse(std::string input) {
311311
// parse quad with regex
312312
if (!std::regex_match(line, match, quadRgx))
313313
throw JsonLdError(JsonLdError::SyntaxError,
314-
"Error while parsing N-Quads; invalid quad. line:" + std::to_string(lineNumber));
314+
"Error while parsing N-Quads; invalid quad. line:" + std::to_string(lineNumber));
315315

316316
// extract subject from matches
317317
std::shared_ptr<RDF::Node> subject;
318-
if(match[1].matched)
319-
subject = std::make_shared<RDF::IRI>(unescape(match[1].str()));
318+
if(match[QUAD_SUBJECT_AS_IRI].matched)
319+
subject = std::make_shared<RDF::IRI>(unescape(match[QUAD_SUBJECT_AS_IRI].str()));
320320
else
321-
subject = std::make_shared<RDF::BlankNode>(unescape(match[2].str()));
321+
subject = std::make_shared<RDF::BlankNode>(unescape(match[QUAD_SUBJECT_AS_BNODE].str()));
322322

323323
// extract predicate from matches
324-
std::shared_ptr<RDF::Node> predicate = std::make_shared<RDF::IRI>(unescape(match[3].str()));
324+
std::shared_ptr<RDF::Node> predicate = std::make_shared<RDF::IRI>(unescape(match[QUAD_PREDICATE].str()));
325325

326326
// extract object from matches
327327
std::shared_ptr<RDF::Node> object;
328-
if(match[4].matched)
329-
object = std::make_shared<RDF::IRI>(unescape(match[4].str()));
330-
else if(match[5].matched)
331-
object = std::make_shared<RDF::BlankNode>(unescape(match[5].str()));
328+
if(match[QUAD_OBJECT_AS_IRI].matched)
329+
object = std::make_shared<RDF::IRI>(unescape(match[QUAD_OBJECT_AS_IRI].str()));
330+
else if(match[QUAD_OBJECT_AS_BNODE].matched)
331+
object = std::make_shared<RDF::BlankNode>(unescape(match[QUAD_OBJECT_AS_BNODE].str()));
332332
else {
333-
std::string language = unescape(match[8].str());
333+
std::string language = unescape(match[QUAD_OBJECT_AS_LITERAL_LANGUAGETAG].str());
334334
std::string datatype;
335-
if (match[7].matched)
336-
datatype = unescape(match[7].str());
335+
if (match[QUAD_OBJECT_AS_LITERAL_DATATYPE].matched)
336+
datatype = unescape(match[QUAD_OBJECT_AS_LITERAL_DATATYPE].str());
337337
else {
338-
if (match[8].matched)
338+
if (match[QUAD_OBJECT_AS_LITERAL_LANGUAGETAG].matched)
339339
datatype = JsonLdConsts::RDF_LANGSTRING;
340340
else
341341
datatype = JsonLdConsts::XSD_STRING;
342342
}
343-
object = std::make_shared<RDF::Literal>(unescape(match[6].str()), &datatype, &language);
343+
object = std::make_shared<RDF::Literal>(unescape(match[QUAD_OBJECT_AS_LITERAL].str()), &datatype, &language);
344344
}
345345

346346
// extract graph name from matches ('@default' is used for the default graph)
347347
std::string name = "@default";
348-
if (match[9].matched) {
349-
name = unescape(match[9].str());
350-
} else if (match[10].matched) {
351-
name = unescape(match[10].str());
348+
if (match[QUAD_GRAPH_AS_IRI].matched) {
349+
name = unescape(match[QUAD_GRAPH_AS_IRI].str());
350+
} else if (match[QUAD_GRAPH_AS_BNODE].matched) {
351+
name = unescape(match[QUAD_GRAPH_AS_BNODE].str());
352352
}
353353

354354
// add RDFTriple to graph in dataset

jsonld-cpp/RDFRegex.cpp

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,54 @@
11
#include "jsonld-cpp/RDFRegex.h"
22

3+
namespace {
34

4-
namespace RDFRegex {
5-
const std::string EOLN(R"((?:\r\n)|(?:\n)|(?:\r))");
65
const std::string WS(R"([ \t]+)");
76
const std::string WSO(R"([ \t]*)");
8-
const std::string EMPTY("^" + WSO + "$");
97

108
const std::string IRI(R"((?:<([^>]*)>))");
11-
const std::string BNODE(R"((_:(?:[A-Za-z_:0-9])(?:(?:[A-Za-z_:0-9\.-])*[A-Za-z_:0-9-])?))");
9+
1210
const std::string PLAIN(R"lit("([^"\\]*(?:\\.[^"\\]*)*)")lit");
1311
const std::string DATATYPE("(?:\\^\\^" + IRI + ")");
1412
const std::string LANGUAGETAG("(?:@([a-z]+(?:-[a-zA-Z0-9]+)*))");
13+
14+
const std::string HEX("[0-9A-Fa-f]");
15+
16+
}
17+
18+
namespace RDFRegex {
19+
20+
const std::string EOLN(R"((?:\r\n)|(?:\n)|(?:\r))");
21+
const std::string EMPTY("^" + WSO + "$");
22+
23+
const std::string BNODE(R"((_:(?:[A-Za-z_:0-9])(?:(?:[A-Za-z_:0-9\.-])*[A-Za-z_:0-9-])?))");
1524
const std::string LANGUAGE("([a-z]+(?:-[a-zA-Z0-9]+)*)");
1625
const std::string LITERAL("(?:" + PLAIN + "(?:" + DATATYPE + "|" + LANGUAGETAG + ")?)");
1726

1827
const std::string SUBJECT("(?:" + IRI + "|" + BNODE + ")" + WS);
19-
const std::string PROPERTY(IRI + WS);
28+
const std::string PREDICATE(IRI + WS);
2029
const std::string OBJECT("(?:" + IRI + "|" + BNODE + "|" + LITERAL + ")" + WSO);
2130
const std::string GRAPH("(?:\\.|(?:(?:" + IRI + "|" + BNODE + ")" + WSO + "\\.))");
2231

23-
const std::string QUAD("^" + WSO + SUBJECT + PROPERTY + OBJECT + GRAPH + WSO + "$");
24-
25-
const std::string HEX("[0-9A-Fa-f]");
26-
const std::string UCHAR_MATCHED(R"(\u005C(?:([tbnrf\\\"'])|(?:u()" + HEX + "{4}))|(?:U(" + HEX + "{8})))");
32+
const std::string QUAD("^" + WSO + SUBJECT + PREDICATE + OBJECT + GRAPH + WSO + "$");
33+
34+
// Indices of the capturing groups within the QUAD regex above, numbered left to right in the order the groups open (index 0 of a match is the entire line)
35+
const int QUAD_SUBJECT_AS_IRI = 1;
36+
const int QUAD_SUBJECT_AS_BNODE = 2;
37+
const int QUAD_PREDICATE = 3;
38+
const int QUAD_OBJECT_AS_IRI = 4;
39+
const int QUAD_OBJECT_AS_BNODE = 5;
40+
const int QUAD_OBJECT_AS_LITERAL = 6;
41+
const int QUAD_OBJECT_AS_LITERAL_DATATYPE = 7;
42+
const int QUAD_OBJECT_AS_LITERAL_LANGUAGETAG = 8;
43+
const int QUAD_GRAPH_AS_IRI = 9;
44+
const int QUAD_GRAPH_AS_BNODE = 10;
45+
46+
const std::string UNICODE_CODEPOINT(R"(\u005C(?:([tbnrf\\\"'])|(?:u()" + HEX + "{4}))|(?:U(" + HEX + "{8})))");
47+
48+
// Indices of the capturing groups within the UNICODE_CODEPOINT regex above: 1 = single-character escape, 2 = four hex digits after \u, 3 = eight hex digits after \U
49+
const int UNICODE_CONTROL_CHARS = 1;
50+
const int UNICODE_BASIC_MULTILINGUAL_PLANE = 2;
51+
const int UNICODE_HIGHER_PLANE = 3;
2752

2853
}
2954

jsonld-cpp/RDFRegex.h

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,32 +4,37 @@
44
#include <string>
55

66
namespace RDFRegex {
7-
// todo: remove note
8-
// Note: The following regexes for matching RDF data is not quite complete. Please see
9-
// https://www.w3.org/TR/n-quads/#sec-grammar for other items needed when unicode support
10-
// is completed.
7+
118
extern const std::string EOLN;
12-
extern const std::string WS;
13-
extern const std::string WSO;
149
extern const std::string EMPTY;
1510

16-
extern const std::string IRI;
1711
extern const std::string BNODE;
18-
extern const std::string PLAIN;
19-
extern const std::string DATATYPE;
2012
extern const std::string LANGUAGE;
2113
extern const std::string LITERAL;
2214

2315
extern const std::string SUBJECT;
24-
extern const std::string PROPERTY;
16+
extern const std::string PREDICATE;
2517
extern const std::string OBJECT;
2618
extern const std::string GRAPH;
2719

2820
extern const std::string QUAD;
2921

30-
extern const std::string HEX;
31-
extern const std::string UCHAR_MATCHED;
32-
22+
extern const int QUAD_SUBJECT_AS_IRI;
23+
extern const int QUAD_SUBJECT_AS_BNODE;
24+
extern const int QUAD_PREDICATE;
25+
extern const int QUAD_OBJECT_AS_IRI;
26+
extern const int QUAD_OBJECT_AS_BNODE;
27+
extern const int QUAD_OBJECT_AS_LITERAL;
28+
extern const int QUAD_OBJECT_AS_LITERAL_DATATYPE;
29+
extern const int QUAD_OBJECT_AS_LITERAL_LANGUAGETAG;
30+
extern const int QUAD_GRAPH_AS_IRI;
31+
extern const int QUAD_GRAPH_AS_BNODE;
32+
33+
extern const std::string UNICODE_CODEPOINT;
34+
35+
extern const int UNICODE_CONTROL_CHARS;
36+
extern const int UNICODE_BASIC_MULTILINGUAL_PLANE;
37+
extern const int UNICODE_HIGHER_PLANE;
3338

3439
}
3540

0 commit comments

Comments
 (0)