Fix problems with parsers

look · look · commit d6e15f2a7d09 · 2017-06-13T23:15:45.000-07:00
- The term rules only matched ASCII alphanumerics, so any non-ASCII character
  caused a parse failure. This is annoying. Fixed by matching "not whitespace"
  (and, in the phrase and heuristic parsers, "not a quotation mark") instead
  of alphanumerics. Added tests for non-ASCII characters in the query string.
- Related to the above: a boolean operator would delimit a clause (no space
  required between terms) because +/- were not allowed in terms, so input
  like "foo+bar" was parsed as two clauses. Using "not whitespace" also fixes
  this. Added tests.
- Added tests for unbalanced quotation marks in phrase query parser.
- Added a test to document that balanced quotation marks can delimit clauses.
  We'll call this a feature. It could be eliminated by using lookahead.
- Fixed the heuristic parser splitting terms that start with a decade string
  (e.g. "2000st") by adding lookahead for whitespace or end of input to the
  decade rule.
diff --git a/README.md b/README.md
@@ -69,6 +69,5 @@ The tutorial is under copyright and cannot be republished without my permission.
 
 ## TODO
 
-- [] decade fix -- requires lookahead to make sure decade actually is. Need to backport fix
 - [] performance?
 - [] final copy edits
diff --git a/boolean_term_parser.rb b/boolean_term_parser.rb
@@ -4,10 +4,10 @@ module BooleanTermParser
   # This query parser adds an optional operator ("+" or "-") to the simple term
   # parser. In order to do that, a new "clause" node is added to the parse tree.
   class QueryParser < Parslet::Parser
-    rule(:term) { match('[a-zA-Z0-9]').repeat(1).as(:term) }
+    rule(:term) { match('[^\s]').repeat(1).as(:term) }
     rule(:operator) { (str('+') | str('-')).as(:operator) }
     rule(:clause) { (operator.maybe >> term).as(:clause) }
-    rule(:space)  { match('\s').repeat(1) }
+    rule(:space) { match('\s').repeat(1) }
     rule(:query) { (clause >> space.maybe).repeat.as(:query) }
     root(:query)
   end
diff --git a/heuristic_parser.rb b/heuristic_parser.rb
@@ -5,12 +5,14 @@ module HeuristicParser
   # It adds a new clause type for date ranges. The parser recognizes strings
   # like "1920s" or "2010" as dates instead of generic terms.
   class QueryParser < Parslet::Parser
+    rule(:eof) { any.absent? }
     rule(:decade) do
       ((str('1') >> str('9') |
         str('2') >> str('0')) >>
-       match('\d') >> str('0')).as(:decade) >> str('s').maybe
+       match('\d') >> str('0')).as(:decade) >>
+        str('s').maybe >> (eof | space).present?
     end
-    rule(:term) { match('[a-zA-Z0-9]').repeat(1).as(:term) }
+    rule(:term) { match('[^\s"]').repeat(1).as(:term) }
     rule(:quote) { str('"') }
     rule(:operator) { (str('+') | str('-')).as(:operator) }
     rule(:phrase) { (quote >> (term >> space.maybe).repeat >> quote).as(:phrase) }
diff --git a/phrase_parser.rb b/phrase_parser.rb
@@ -5,7 +5,7 @@ module PhraseParser
   # terms. This is done creating multiple types of clauses instead of just one.
   # A phrase clause generates an Elasticsearch match_phrase query.
   class QueryParser < Parslet::Parser
-    rule(:term) { match('[a-zA-Z0-9]').repeat(1).as(:term) }
+    rule(:term) { match('[^\s"]').repeat(1).as(:term) }
     rule(:quote) { str('"') }
     rule(:operator) { (str('+') | str('-')).as(:operator) }
     rule(:phrase) { (quote >> (term >> space.maybe).repeat >> quote).as(:phrase) }
diff --git a/term_parser.rb b/term_parser.rb
@@ -1,10 +1,10 @@
 require 'parslet'
 
 module TermParser
-  # This is a simple parser that matches a sequence of alphanumeric characters and
-  # converts it to an Elasticsearch match query.
+  # This is a simple parser that matches a sequence of non-whitespace characters
+  # and converts it to an Elasticsearch match query.
   class QueryParser < Parslet::Parser
-    rule(:term) { match('[a-zA-Z0-9]').repeat(1).as(:term) }
+    rule(:term) { match('[^\s]').repeat(1).as(:term) }
     rule(:space) { match('\s').repeat(1) }
     rule(:query) { (term >> space.maybe).repeat.as(:query) }
     root(:query)
diff --git a/tests/boolean_term_parser_tests.rb b/tests/boolean_term_parser_tests.rb
@@ -1,3 +1,4 @@
+# coding: utf-8
 require 'minitest/autorun'
 require_relative '../boolean_term_parser'
 
@@ -28,4 +29,27 @@ def test_multiple_terms_with_operators
                            {:clause => {:operator => '-', :term => 'cat'}}]}
     assert_equal(expected, tree)
   end
+
+  def test_non_ascii_characters
+    tree = BooleanTermParser::QueryParser.new.parse('+føé -ba∑ ∫åñ')
+    expected = {:query => [{:clause => {:operator => '+', :term => 'føé'}},
+                           {:clause => {:operator => '-', :term => 'ba∑'}},
+                           {:clause => {:term => '∫åñ'}}]}
+    assert_equal(expected, tree)
+  end
+
+  def test_operators_in_terms
+    tree = BooleanTermParser::QueryParser.new.parse('-foo+term +bar-term baz-term')
+    expected = {:query => [{:clause => {:operator => '-', :term => 'foo+term'}},
+                           {:clause => {:operator => '+', :term => 'bar-term'}},
+                           {:clause => {:term => 'baz-term'}}]}
+    assert_equal(expected, tree)
+  end
+
+  def test_quotation_marks
+    tree = BooleanTermParser::QueryParser.new.parse('+fo"o -ba"r')
+    expected = {:query => [{:clause => {:operator => '+', :term => 'fo"o'}},
+                           {:clause => {:operator => '-', :term => 'ba"r'}}]}
+    assert_equal(expected, tree)
+  end
 end
diff --git a/tests/heuristic_parser_tests.rb b/tests/heuristic_parser_tests.rb
@@ -8,11 +8,29 @@ def test_date_range
   end
 
   def test_complex_query
-    tree = HeuristicParser::QueryParser.new.parse('awesome "cat videos" -2000s')
-    expected = {:query => [{:clause => {:term => 'awesome'}},
+    tree = HeuristicParser::QueryParser.new.parse('+paw-some "cat videos" -2000s')
+    expected = {:query => [{:clause => {:operator => '+', :term => 'paw-some'}},
                            {:clause => {:phrase => [{:term => 'cat'}, {:term => 'videos'}]}},
                            {:clause => {:operator => '-', :decade => '2000'}}]}
 
     assert_equal(expected, tree)
   end
+
+  def test_term_prefixed_with_decade
+    tree = HeuristicParser::QueryParser.new.parse('2000st')
+    expected = {:query => [{:clause => {:term => '2000st'}}]}
+    assert_equal(expected, tree)
+  end
+
+  def test_term_suffixed_with_decade
+    tree = HeuristicParser::QueryParser.new.parse('st2000')
+    expected = {:query => [{:clause => {:term => 'st2000'}}]}
+    assert_equal(expected, tree)
+  end
+
+  def test_non_decade_parsed_as_term
+    tree = HeuristicParser::QueryParser.new.parse('2001')
+    expected = {:query => [{:clause => {:term => '2001'}}]}
+    assert_equal(expected, tree)
+  end
 end
diff --git a/tests/phrase_parser_tests.rb b/tests/phrase_parser_tests.rb
@@ -69,4 +69,26 @@ def test_complex_query
 
     assert_equal(expected, tree)
   end
+
+  def test_mismatched_quotation_marks
+    assert_raises Parslet::ParseFailed do
+      PhraseParser::QueryParser.new.parse('"foo')
+    end
+  end
+
+  def test_quotation_mark_in_term
+    assert_raises Parslet::ParseFailed do
+      PhraseParser::QueryParser.new.parse('fo"o')
+    end
+  end
+
+  def test_mismatched_quotation_mark_delimiter
+    # We'll call this a "feature" since the quotation marks are balanced.
+    # If you don't want this, you can use lookahead to ensure end-quote is followed by a space or EOF
+    tree = PhraseParser::QueryParser.new.parse('"foo"+bar"baz"')
+    expected = {:query => [{:clause => {:phrase => [{:term => 'foo'}]}},
+                           {:clause => {:operator => '+', :term => 'bar'}},
+                           {:clause => {:phrase => [{:term => 'baz'}]}}]}
+    assert_equal(expected, tree)
+  end
 end
diff --git a/tests/term_parser_tests.rb b/tests/term_parser_tests.rb
@@ -1,3 +1,4 @@
+# coding: utf-8
 require 'minitest/autorun'
 require_relative '../term_parser'
 
@@ -16,4 +17,14 @@ def test_multiple_spaces_between_terms
     tree = TermParser::QueryParser.new.parse('foo    bar')
     assert_equal({:query => [{:term => 'foo'}, {:term => 'bar'}]}, tree)
   end
+
+  def test_non_ascii_characters
+    tree = TermParser::QueryParser.new.parse('føé ba∑')
+    assert_equal({:query => [{:term => 'føé'}, {:term => 'ba∑'}]}, tree)
+  end
+
+  def test_quotation_marks
+    tree = TermParser::QueryParser.new.parse('fo"o')
+    assert_equal({:query => [{:term => 'fo"o'}]}, tree)
+  end
 end
diff --git a/tutorial/build_a_query_parser.md b/tutorial/build_a_query_parser.md
@@ -324,7 +324,7 @@ Extra input after last repetition at line 1 char 6.
 
 The simple parser above can recognize strings that match the grammar, but can't do anything with it. Using `#as`, we can capture parts of the input that we want to keep and save them as a parse tree. Anything not named with `#as` is discarded.
 
-We need to capture the terms and the overall query.
+We need to capture the terms and the overall query. To be more flexible with user input, the `term` rule has been updated to match any non-whitespace character.
 
     {{code="term_parser.rb:6-11"}}
 
@@ -574,9 +574,9 @@ Where `decade` is defined as:
 
 To implement this, we add the new `decade` rule to the parser and use it in the `clause` rule.
 
-    {{code="heuristic_parser.rb:7-21"}}
+    {{code="heuristic_parser.rb:7-23"}}
 
-A PEG parser always takes the first alternative, so we need to make `decade` match before `term`, because a `decade` is always a valid `term`. If we didn't do this, the `decade` rule would never match.
+A PEG parser always takes the first alternative, so we need to make `decade` match before `term`, because a `decade` is always a valid `term`. If we didn't do this, the `decade` rule would never match. We also need to add a lookahead for space or end of input to the `decade` rule. Without this, input like <span class="query-string">1990th</span> would be parsed as a decade `1990` and a term `th`.
 
 For the transformer, we define a `DateRangeClause` class that takes a number and converts it into a start and end date: