From f8dfb08d8ae3b290509b4615a33e780a65227335 Mon Sep 17 00:00:00 2001
From: vxern
Date: Sun, 22 Aug 2021 12:58:20 +0100
Subject: [PATCH] Initial release: 1.0.0

---
 .gitignore            |   6 ++
 CHANGELOG.md          |   3 +
 README.md             |  28 ++++++++
 analysis_options.yaml |   1 +
 lib/robots_txt.dart   |   6 ++
 lib/src/parser.dart   | 146 ++++++++++++++++++++++++++++++++++++++++++
 lib/src/rule.dart     |  30 +++++++++
 lib/src/ruleset.dart  |  46 +++++++++++++
 lib/src/utils.dart    |   2 +
 pubspec.lock          | 131 +++++++++++++++++++++++++++++++++++++
 pubspec.yaml          |  20 ++++++
 11 files changed, 419 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 CHANGELOG.md
 create mode 100644 README.md
 create mode 100644 analysis_options.yaml
 create mode 100644 lib/robots_txt.dart
 create mode 100644 lib/src/parser.dart
 create mode 100644 lib/src/rule.dart
 create mode 100644 lib/src/ruleset.dart
 create mode 100644 lib/src/utils.dart
 create mode 100644 pubspec.lock
 create mode 100644 pubspec.yaml

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3c8a157
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+# Files and directories created by pub.
+.dart_tool/
+.packages
+
+# Conventional directory for build output.
+build/
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..6bde737
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,3 @@
+## 1.0.0
+
+ - Initial release
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9891e4c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,28 @@
+## A simple yet complete, lightweight and sturdy `robots.txt` ruleset parser that ensures your application follows the robots exclusion protocol.
+
+### Usage
+
+The following code fetches and parses the `robots.txt` robot exclusion ruleset of a website.
+
+`quietMode` determines whether the library should print warning messages when the `robots.txt` file is invalid or another error occurs.
+
+```dart
+// Create an instance of the `robots.txt` parser
+final robots = Robots(host: 'host', quietMode: true);
+// Read the ruleset of the website
+robots.read().then((_) {
+  for (final ruleset in robots.rulesets) {
+    // Print the user-agent the ruleset applies to
+    print(ruleset.appliesTo);
+    print('Allows:');
+    // Print the path expressions allowed by this ruleset
+    for (final rule in ruleset.allows) {
+      print('  - ${rule.expression}');
+    }
+    // Print the path expressions disallowed by this ruleset
+    for (final rule in ruleset.disallows) {
+      print('  - ${rule.expression}');
+    }
+  }
+});
+```
\ No newline at end of file
diff --git a/analysis_options.yaml b/analysis_options.yaml
new file mode 100644
index 0000000..e925772
--- /dev/null
+++ b/analysis_options.yaml
@@ -0,0 +1 @@
+include: package:words/core.yaml
\ No newline at end of file
diff --git a/lib/robots_txt.dart b/lib/robots_txt.dart
new file mode 100644
index 0000000..6406f4f
--- /dev/null
+++ b/lib/robots_txt.dart
@@ -0,0 +1,6 @@
+/// Lightweight, sturdy and fully documented library for parsing the
+/// `robots.txt` file. Nothing more, nothing less.
+library robots_txt;
+
+export 'src/parser.dart';
+export 'src/rule.dart';
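Note: to complement the README example above, here is a minimal end-to-end sketch of how the library exported by `lib/robots_txt.dart` might be used to gate a crawler. The host `https://example.com`, the path and the user-agent name are hypothetical; the snippet is illustrative only and not part of the patch.

```dart
import 'package:robots_txt/robots_txt.dart';

Future<void> main() async {
  // Fetch and parse the `robots.txt` of a hypothetical host.
  final robots = Robots(host: 'https://example.com');
  await robots.read();

  // Check whether a hypothetical crawler may visit a given path.
  const path = '/private/data.json';
  final canVisit = robots.canVisitPath(path, userAgent: 'my-crawler');
  print(canVisit ? 'May visit $path' : 'May not visit $path');
}
```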
diff --git a/lib/src/parser.dart b/lib/src/parser.dart
new file mode 100644
index 0000000..dbf3824
--- /dev/null
+++ b/lib/src/parser.dart
@@ -0,0 +1,146 @@
+import 'package:sprint/sprint.dart';
+import 'package:web_scraper/web_scraper.dart';
+
+import 'package:robots_txt/src/rule.dart';
+import 'package:robots_txt/src/ruleset.dart';
+import 'package:robots_txt/src/utils.dart';
+
+/// Abstracts away the rather convoluted declaration for an element with two
+// ignore: comment_references
+/// fields: [title] and [attributes]. [attributes] is a map containing the
+/// attributes of the element.
+typedef Element = Map<String, Map<String, dynamic>>;
+
+/// Allows for parsing of a host's `robots.txt` to get information about which
+/// of its resources may or may not be accessed, as well as which of its pages
+/// cannot be traversed.
+class Robots {
+  /// Instance of the `Sprint` message printer for the `robots.txt` parser.
+  final Sprint log;
+
+  /// The host of this `robots.txt` file.
+  final String host;
+
+  /// Stores an instance of the scraper for a given URL.
+  final WebScraper scraper;
+
+  /// Stores expressions for paths which may or may not be traversed.
+  final List<Ruleset> rulesets = [];
+
+  /// Creates an instance of a `robots.txt` parser for the [host].
+  Robots({
+    required this.host,
+    bool quietMode = false,
+    bool productionMode = true,
+  })  : scraper = WebScraper(host),
+        log = Sprint('Robots',
+            quietMode: quietMode, productionMode: productionMode);
+
+  /// Reads and parses the `robots.txt` file of the host.
+  Future<List<Ruleset>> read() async {
+    await scraper.loadWebPage('/robots.txt');
+    final preformatted = scraper.getElement('pre', []);
+    log.debug(preformatted);
+
+    final invalidRobotsFileError = "'$host' has an invalid `robots.txt`:";
+
+    if (preformatted.isEmpty) {
+      log.warn('$invalidRobotsFileError No text elements found');
+      return rulesets;
+    }
+
+    final attributes = preformatted[0]['attributes'] as Map<String, dynamic>;
+    log.debug(attributes);
+
+    if (!attributes.containsKey('innerText') ||
+        attributes['innerText']!.isEmpty) {
+      log.warn('$invalidRobotsFileError '
+          'The preformatted element does not contain text');
+      return rulesets;
+    }
+
+    final lines = attributes['innerText']!.split('\n');
+    return rulesets..addAll(parseRuleset(lines));
+  }
+
+  /// Iterates over [lines] and parses each ruleset, additionally ignoring
+  /// those rulesets which are not relevant to [onlyRelevantTo].
+  List<Ruleset> parseRuleset(List<String> lines, {String? onlyRelevantTo}) {
+    final rulesets = <Ruleset>[];
+
+    Ruleset? ruleset;
+    for (var index = 0; index < lines.length; index++) {
+      final field = getRobotsFieldFromLine(lines[index]);
+
+      switch (field.key) {
+        case 'user-agent':
+          if (onlyRelevantTo != null && field.value != onlyRelevantTo) {
+            ruleset = null;
+            break;
+          }
+          // Add the ruleset to the list upfront so that the final ruleset
+          // in the file is not dropped once the lines run out.
+          ruleset = Ruleset(field.value);
+          rulesets.add(ruleset);
+          break;
+
+        case 'allow':
+          if (ruleset != null) {
+            final expression = convertFieldPathToExpression(field.value);
+            ruleset.allows.add(Rule(expression, index));
+          }
+          break;
+        case 'disallow':
+          if (ruleset != null) {
+            final expression = convertFieldPathToExpression(field.value);
+            ruleset.disallows.add(Rule(expression, index));
+          }
+          break;
+      }
+    }
+
+    log.debug(
+        'Read robots.txt of $host: ${pluralise('ruleset', rulesets.length)}');
+    return rulesets;
+  }
+
+  /// Reads a path declaration from within `robots.txt` and converts it to a
+  /// regular expression for later matching.
+  RegExp convertFieldPathToExpression(String pathDeclaration) {
+    // Collapse runs of slashes and wildcards into singles. `replaceAll` is
+    // chained rather than cascaded; a cascade on an immutable `String` would
+    // discard its results.
+    final collapsed = pathDeclaration
+        .replaceAll(RegExp('/+'), '/')
+        .replaceAll(RegExp(r'\*+'), '*');
+    final normalised = collapsed.endsWith('*')
+        ? collapsed.substring(0, collapsed.length - 1)
+        : collapsed;
+    final withRegexWildcards = normalised
+        .replaceAll('.', r'\.')
+        .replaceAll('*', '.+');
+    final withTrailingText = withRegexWildcards.contains(r'$')
+        ? withRegexWildcards.split(r'$')[0]
+        : '$withRegexWildcards.+';
+    return RegExp(withTrailingText);
+  }
+
+  /// Extracts the key and value from [target] and puts them into a `MapEntry`.
+  MapEntry<String, String> getRobotsFieldFromLine(String target) {
+    final keyValuePair = target.split(':');
+    final key = keyValuePair[0].toLowerCase();
+    final value = keyValuePair.sublist(1).join(':').trim();
+    return MapEntry(key, value);
+  }
+
+  /// Determines whether or not [path] may be traversed.
+  bool canVisitPath(String path, {required String userAgent}) {
+    final explicitAllowance = rulesets.getRule(
+        appliesTo: userAgent, concernsPath: path, andAllowsIt: true);
+    final explicitDisallowance = rulesets.getRule(
+        appliesTo: userAgent, concernsPath: path, andAllowsIt: false);
+
+    final allowancePriority = explicitAllowance?.priority ?? -1;
+    final disallowancePriority = explicitDisallowance?.priority ?? -1;
+
+    return allowancePriority >= disallowancePriority;
+  }
+}
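Note: the following sketch shows what `convertFieldPathToExpression` above produces for two common path declarations, assuming the chained `replaceAll` calls in the patch. The host is hypothetical and the snippet is illustrative only.

```dart
import 'package:robots_txt/src/parser.dart';

void main() {
  final robots = Robots(host: 'https://example.com'); // Hypothetical host.

  // A trailing wildcard becomes a prefix match: `/private/` followed by
  // at least one character.
  final wildcard = robots.convertFieldPathToExpression('/private/*');
  print(wildcard.pattern); // /private/.+
  print(wildcard.hasMatch('/private/file.txt')); // true

  // A `$`-anchored declaration keeps only the text before the anchor.
  final anchored = robots.convertFieldPathToExpression(r'/*.json$');
  print(anchored.pattern); // /.+\.json
  print(anchored.hasMatch('/data/records.json')); // true
}
```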
diff --git a/lib/src/rule.dart b/lib/src/rule.dart
new file mode 100644
index 0000000..78a3215
--- /dev/null
+++ b/lib/src/rule.dart
@@ -0,0 +1,30 @@
+/// A single rule (either `Allow` or `Disallow`) inside the `robots.txt` file.
+class Rule {
+  /// An expression which a path may be matched against to determine whether
+  /// this rule applies to the path.
+  final RegExp expression;
+
+  /// The priority of this rule based on its position inside the `robots.txt`
+  /// file. If a path is determined to be relevant to two rules, the rule
+  /// with the higher priority *overrides* the ruling of the other.
+  final int priority;
+
+  /// Instantiates a rule with an [expression] and the [priority] it has over
+  /// other rules.
+  const Rule(this.expression, this.priority);
+}
+
+/// Extends `List<Rule>` with a method for getting the `Rule` with the highest
+/// [Rule.priority].
+extension RulingOnPath on List<Rule> {
+  /// Taking [path], checks which `Rule`s' expressions match [path], and
+  /// returns the `Rule` with the highest priority.
+  Rule? getRulingOnPath(String path) {
+    final relevantRules = where((rule) => rule.expression.hasMatch(path));
+    if (relevantRules.isEmpty) {
+      return null;
+    }
+    // Get the relevant rule with the highest priority.
+    return relevantRules.reduce((a, b) => a.priority > b.priority ? a : b);
+  }
+}
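Note: a short sketch of how the `getRulingOnPath` extension above arbitrates between overlapping rules; the expressions and priorities are made up for illustration.

```dart
import 'package:robots_txt/src/rule.dart';

void main() {
  // Both rules match the path below; the rule declared later in the file
  // (and therefore carrying the higher priority) wins.
  final rules = [
    Rule(RegExp('/files/.+'), 0),
    Rule(RegExp('/files/secret/.+'), 1),
  ];

  final ruling = rules.getRulingOnPath('/files/secret/document.txt');
  print(ruling?.priority); // 1
}
```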
diff --git a/lib/src/ruleset.dart b/lib/src/ruleset.dart
new file mode 100644
index 0000000..a83f142
--- /dev/null
+++ b/lib/src/ruleset.dart
@@ -0,0 +1,46 @@
+import 'package:robots_txt/src/rule.dart';
+
+/// A collection of `Rule`s, and the `user-agent` they are relevant to inside
+/// the `robots.txt` file.
+class Ruleset {
+  /// The `user-agent` which this ruleset applies to.
+  final String appliesTo;
+
+  /// List of `Rule`s which explicitly state that a path may be traversed.
+  final List<Rule> allows = [];
+
+  /// List of `Rule`s which explicitly state that a path may not be traversed.
+  final List<Rule> disallows = [];
+
+  /// Instantiates a ruleset with the `user-agent` it applies to.
+  Ruleset(this.appliesTo);
+
+  /// Checks whether this ruleset applies to [userAgent].
+  bool doesConcern(String userAgent) =>
+      appliesTo == '*' || appliesTo == userAgent;
+}
+
+/// Extends `List<Ruleset>` with a method for getting a single `Rule` from the
+/// list of `Ruleset`s.
+extension RulingOfRulesets on List<Ruleset> {
+  /// Gets the rule which [appliesTo], [concernsPath] [andAllowsIt].
+  Rule? getRule({
+    required String appliesTo,
+    required String concernsPath,
+    required bool andAllowsIt,
+  }) =>
+      fold<Rule?>(null, (current, next) {
+        if (!next.doesConcern(appliesTo)) {
+          return current;
+        }
+
+        final currentPriority = current?.priority ?? -1;
+        final relevantRules = andAllowsIt ? next.allows : next.disallows;
+        final nextRule = relevantRules.getRulingOnPath(concernsPath);
+
+        if (nextRule == null || nextRule.priority < currentPriority) {
+          return current;
+        }
+        return nextRule;
+      });
+}
diff --git a/lib/src/utils.dart b/lib/src/utils.dart
new file mode 100644
index 0000000..e207f42
--- /dev/null
+++ b/lib/src/utils.dart
@@ -0,0 +1,2 @@
+/// Takes the singular form of [word] and pluralises it according to [count].
+String pluralise(String word, int count) => count == 1 ? word : '${word}s';
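Note: to tie `Ruleset`, `Rule` and the extensions together, here is a sketch of the arbitration that `canVisitPath` performs internally; all rules, paths and the agent name are illustrative.

```dart
import 'package:robots_txt/src/rule.dart';
import 'package:robots_txt/src/ruleset.dart';

void main() {
  // Equivalent `robots.txt`:
  //   User-agent: *
  //   Disallow: /secret/
  //   Allow: /secret/public/
  final ruleset = Ruleset('*')
    ..disallows.add(Rule(RegExp('/secret/.+'), 1))
    ..allows.add(Rule(RegExp('/secret/public/.+'), 2));

  final rulesets = [ruleset];
  const path = '/secret/public/page.html';

  final allowance = rulesets.getRule(
      appliesTo: 'my-crawler', concernsPath: path, andAllowsIt: true);
  final disallowance = rulesets.getRule(
      appliesTo: 'my-crawler', concernsPath: path, andAllowsIt: false);

  // The `Allow` rule was declared later, so its priority prevails.
  print((allowance?.priority ?? -1) >= (disallowance?.priority ?? -1)); // true
}
```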
diff --git a/pubspec.lock b/pubspec.lock
new file mode 100644
index 0000000..b5dbabb
--- /dev/null
+++ b/pubspec.lock
@@ -0,0 +1,131 @@
+# Generated by pub
+# See https://dart.dev/tools/pub/glossary#lockfile
+packages:
+  ansicolor:
+    dependency: transitive
+    description:
+      name: ansicolor
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "2.0.1"
+  async:
+    dependency: transitive
+    description:
+      name: async
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "2.8.2"
+  charcode:
+    dependency: transitive
+    description:
+      name: charcode
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.3.1"
+  collection:
+    dependency: transitive
+    description:
+      name: collection
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.15.0"
+  csslib:
+    dependency: transitive
+    description:
+      name: csslib
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "0.17.0"
+  html:
+    dependency: transitive
+    description:
+      name: html
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "0.15.0"
+  http:
+    dependency: transitive
+    description:
+      name: http
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "0.13.3"
+  http_parser:
+    dependency: transitive
+    description:
+      name: http_parser
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "4.0.0"
+  meta:
+    dependency: transitive
+    description:
+      name: meta
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.7.0"
+  path:
+    dependency: transitive
+    description:
+      name: path
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.8.0"
+  pedantic:
+    dependency: transitive
+    description:
+      name: pedantic
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.11.1"
+  source_span:
+    dependency: transitive
+    description:
+      name: source_span
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.8.1"
+  sprint:
+    dependency: "direct main"
+    description:
+      name: sprint
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.0.2+2"
+  string_scanner:
+    dependency: transitive
+    description:
+      name: string_scanner
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.1.0"
+  term_glyph:
+    dependency: transitive
+    description:
+      name: term_glyph
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.2.0"
+  typed_data:
+    dependency: transitive
+    description:
+      name: typed_data
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.3.0"
+  web_scraper:
+    dependency: "direct main"
+    description:
+      name: web_scraper
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "0.1.4"
+  words:
+    dependency: "direct dev"
+    description:
+      name: words
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "0.0.1+2"
+sdks:
+  dart: ">=2.13.0 <3.0.0"
diff --git a/pubspec.yaml b/pubspec.yaml
new file mode 100644
index 0000000..3d73876
--- /dev/null
+++ b/pubspec.yaml
@@ -0,0 +1,20 @@
+publish_to: https://pub.dev
+
+name: robots_txt
+version: 1.0.0
+
+description: Lightweight, sturdy and fully-documented parser of the `robots.txt` file.
+
+homepage: https://github.com/wordcollector/robots_txt
+repository: https://github.com/wordcollector/robots_txt
+issue_tracker: https://github.com/wordcollector/robots_txt/issues
+
+environment:
+  sdk: '>=2.13.0 <3.0.0'
+
+dependencies:
+  sprint: ^1.0.2+2
+  web_scraper: ^0.1.4
+
+dev_dependencies:
+  words: ^0.0.1+2
\ No newline at end of file