Initial release: 1.0.0
vxern committed Aug 22, 2021
0 parents commit f8dfb08
Showing 11 changed files with 419 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@
# Files and directories created by pub.
.dart_tool/
.packages

# Conventional directory for build output.
build/
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,3 @@
## 1.0.0

- Initial release
28 changes: 28 additions & 0 deletions README.md
@@ -0,0 +1,28 @@
## A simple yet complete, lightweight and sturdy `robots.txt` ruleset parser that ensures your application follows the robots exclusion protocol.

### Usage

The following code reads and parses the `robots.txt` robot exclusion ruleset of a website.

`quietMode` determines whether the library should print warning messages when the `robots.txt` file is invalid or another error occurs.

```dart
// Create an instance of the `robots.txt` parser.
final robots = Robots(host: 'host', quietMode: true);
// Read and parse the ruleset of the website.
robots.read().then((_) {
  for (final ruleset in robots.rulesets) {
    // Print the user-agent this ruleset applies to.
    print(ruleset.appliesTo);
    print('Allows:');
    // Print the path expressions allowed by this ruleset.
    for (final rule in ruleset.allows) {
      print(' - ${rule.expression}');
    }
    print('Disallows:');
    // Print the path expressions disallowed by this ruleset.
    for (final rule in ruleset.disallows) {
      print(' - ${rule.expression}');
    }
  }
});
```
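
Once the ruleset has been read, `canVisitPath()` reports whether a given path may be traversed on behalf of a particular user-agent. Continuing from the snippet above, a minimal sketch (the path is invented for illustration):

```dart
// Check whether the wildcard user-agent may visit a hypothetical path.
final canVisit = robots.canVisitPath('/gist/', userAgent: '*');
print(canVisit ? 'The path may be traversed.' : 'The path may not be traversed.');
```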
1 change: 1 addition & 0 deletions analysis_options.yaml
@@ -0,0 +1 @@
include: package:words/core.yaml
6 changes: 6 additions & 0 deletions lib/robots_txt.dart
@@ -0,0 +1,6 @@
/// Lightweight, sturdy and fully documented library for parsing the
/// `robots.txt` file. Nothing more, nothing less.
library robots_txt;

export 'src/parser.dart';
export 'src/rule.dart';
export 'src/ruleset.dart';
146 changes: 146 additions & 0 deletions lib/src/parser.dart
@@ -0,0 +1,146 @@
import 'package:sprint/sprint.dart';
import 'package:web_scraper/web_scraper.dart';

import 'package:robots_txt/src/rule.dart';
import 'package:robots_txt/src/ruleset.dart';
import 'package:robots_txt/src/utils.dart';

/// Abstracts away the rather convoluted declaration for an element with two
// ignore: comment_references
/// fields: [title] and [attributes]. [attributes] is a map containing the
/// attributes of the element.
typedef Element = Map<String, Map<String, dynamic>>;

/// Allows for parsing of a host's `robots.txt` to get information about which
/// of its resources may or may not be accessed, as well as which of its pages
/// cannot be traversed.
class Robots {
  /// Instance of `Sprint` message printer for the `robots.txt` parser.
  final Sprint log;

  /// The host of this `robots.txt` file.
  final String host;

  /// Stores an instance of the scraper for a given URL.
  final WebScraper scraper;

  /// Stores expressions for both paths which may or may not be traversed.
  final List<Ruleset> rulesets = [];

  /// Creates an instance of a `robots.txt` parser for the [host].
  Robots({
    required this.host,
    bool quietMode = false,
    bool productionMode = true,
  })  : scraper = WebScraper(host),
        log = Sprint('Robots',
            quietMode: quietMode, productionMode: productionMode);

  /// Reads and parses the `robots.txt` file of the host.
  Future<List<Ruleset>> read() async {
    await scraper.loadWebPage('/robots.txt');
    final preformatted = scraper.getElement('pre', []);
    log.debug(preformatted);

    final invalidRobotsFileError = "'$host' has an invalid `robots.txt`:";

    if (preformatted.isEmpty) {
      log.warn('$invalidRobotsFileError No text elements found');
      return rulesets;
    }

    final attributes = preformatted[0]['attributes'] as Map<String, String>;
    log.debug(attributes);

    if (!attributes.containsKey('innerText') ||
        attributes['innerText']!.isEmpty) {
      log.warn('$invalidRobotsFileError '
          'The preformatted element does not contain text');
      return rulesets;
    }

    final lines = attributes['innerText']!.split('\n');
    // Store the parsed rulesets so they remain accessible through [rulesets].
    rulesets.addAll(parseRuleset(lines));
    return rulesets;
  }

  /// Iterates over [lines] and parses each ruleset, additionally ignoring
  /// those rulesets which are not relevant to [onlyRelevantTo].
  List<Ruleset> parseRuleset(List<String> lines, {String? onlyRelevantTo}) {
    final rulesets = <Ruleset>[];

    Ruleset? ruleset;
    for (var index = 0; index < lines.length; index++) {
      final field = getRobotsFieldFromLine(lines[index]);

      switch (field.key) {
        case 'user-agent':
          if (ruleset != null) {
            rulesets.add(ruleset);
          }
          if (onlyRelevantTo != null && field.value != onlyRelevantTo) {
            ruleset = null;
            break;
          }
          ruleset = Ruleset(field.value);
          break;

        case 'allow':
          if (ruleset != null) {
            final expression = convertFieldPathToExpression(field.value);
            ruleset.allows.add(Rule(expression, index));
          }
          break;
        case 'disallow':
          if (ruleset != null) {
            final expression = convertFieldPathToExpression(field.value);
            ruleset.disallows.add(Rule(expression, index));
          }
          break;
      }
    }
    // Commit the final ruleset, which is not followed by another `user-agent`
    // field and would otherwise be dropped.
    if (ruleset != null) {
      rulesets.add(ruleset);
    }

    log.debug('Read robots.txt of $host: '
        '${rulesets.length} ${pluralise('ruleset', rulesets.length)}');
    return rulesets;
  }

  /// Reads a path declaration from within `robots.txt` and converts it to a
  /// regular expression for later matching.
  RegExp convertFieldPathToExpression(String pathDeclaration) {
    // Collapse runs of slashes and wildcards into single characters.
    final collapsed = pathDeclaration
        .replaceAll(RegExp('/+'), '/')
        .replaceAll(RegExp(r'\*+'), '*');
    final normalised = collapsed.endsWith('*')
        ? collapsed.substring(0, collapsed.length - 1)
        : collapsed;
    final withRegexWildcards = normalised
        .replaceAll('.', r'\.')
        .replaceAll('*', '.+');
    final withTrailingText = withRegexWildcards.contains(r'$')
        ? withRegexWildcards.split(r'$')[0]
        : '$withRegexWildcards.+';
    return RegExp(withTrailingText);
  }

  /// Extracts the key and value from [target] and puts them into a `MapEntry`.
  MapEntry<String, String> getRobotsFieldFromLine(String target) {
    final keyValuePair = target.split(':');
    final key = keyValuePair[0].toLowerCase();
    final value = keyValuePair.sublist(1).join(':').trim();
    return MapEntry(key, value);
  }

  /// Determines whether or not [path] may be traversed by [userAgent].
  bool canVisitPath(String path, {required String userAgent}) {
    final explicitAllowance = rulesets.getRule(
        appliesTo: userAgent, concernsPath: path, andAllowsIt: true);
    final explicitDisallowance = rulesets.getRule(
        appliesTo: userAgent, concernsPath: path, andAllowsIt: false);

    final allowancePriority = explicitAllowance?.priority ?? -1;
    final disallowancePriority = explicitDisallowance?.priority ?? -1;

    return allowancePriority >= disallowancePriority;
  }
}
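
To make the path-matching step above concrete, here is a minimal sketch of `convertFieldPathToExpression()` in action; the host and path declarations are invented for illustration. A trailing `*` is dropped, `.` is escaped, and the remaining wildcards become `.+`:

```dart
import 'package:robots_txt/robots_txt.dart';

void main() {
  final robots = Robots(host: 'https://example.com');
  // `/fish*` loses its trailing wildcard and becomes the pattern `/fish.+`.
  final expression = robots.convertFieldPathToExpression('/fish*');
  print(expression.hasMatch('/fish/salmon')); // true
  print(expression.hasMatch('/Fish.asp')); // false
}
```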
30 changes: 30 additions & 0 deletions lib/src/rule.dart
@@ -0,0 +1,30 @@
/// A single rule (either `Allow` or `Disallow`) inside the `robots.txt` file.
class Rule {
  /// An expression which a path may be matched against to determine whether
  /// this rule applies to the path.
  final RegExp expression;

  /// The priority of this rule based on its position inside the `robots.txt`
  /// file. If the path is determined to be relevant to two rules, the rule
  /// with the higher priority *overrides* the ruling of the other.
  final int priority;

  /// Instantiates a rule with an [expression] and the [priority] it has over
  /// other rules.
  const Rule(this.expression, this.priority);
}

/// Extends `List<Rule>` with a method for getting the `Rule` with the highest
/// [Rule.priority].
extension RulingOnPath on List<Rule> {
  /// Taking [path], checks which `Rule`s' expressions match [path], and
  /// returns the `Rule` with the highest priority.
  Rule? getRulingOnPath(String path) {
    final relevantRules = where((rule) => rule.expression.hasMatch(path));
    if (relevantRules.isEmpty) {
      return null;
    }
    // Get the relevant rule with the highest priority.
    return relevantRules.reduce((a, b) => a.priority > b.priority ? a : b);
  }
}
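
The priority mechanism can be illustrated with a short sketch; the patterns and priorities here are invented for the example. Of two rules matching the same path, `getRulingOnPath()` returns the one declared later in the file:

```dart
import 'package:robots_txt/robots_txt.dart';

void main() {
  // Both rules match the path below; the later (higher-priority) one wins.
  final rules = [
    Rule(RegExp('/fish.+'), 0),
    Rule(RegExp('/fish/salmon.+'), 1),
  ];
  final ruling = rules.getRulingOnPath('/fish/salmon/pink');
  print(ruling?.priority); // 1
}
```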
46 changes: 46 additions & 0 deletions lib/src/ruleset.dart
@@ -0,0 +1,46 @@
import 'package:robots_txt/src/rule.dart';

/// A collection of `Rule`s, and the `user-agent` they are relevant to inside
/// the `robots.txt` file.
class Ruleset {
  /// The `user-agent` which this ruleset applies to.
  final String appliesTo;

  /// List of `Rule`s which explicitly state that a path may be traversed.
  final List<Rule> allows = [];

  /// List of `Rule`s which explicitly state that a path may not be traversed.
  final List<Rule> disallows = [];

  /// Instantiates a ruleset with the `user-agent`.
  Ruleset(this.appliesTo);

  /// Checks whether this ruleset applies to [userAgent].
  bool doesConcern(String userAgent) =>
      appliesTo == '*' || appliesTo == userAgent;
}

/// Extends `List<Ruleset>` with a method for getting a single `Rule` from the
/// list of `Ruleset`s.
extension RulingOfRulesets on List<Ruleset> {
  /// Gets the rule which [appliesTo], [concernsPath] and [andAllowsIt].
  Rule? getRule({
    required String appliesTo,
    required String concernsPath,
    required bool andAllowsIt,
  }) =>
      fold<Rule?>(null, (current, next) {
        if (!next.doesConcern(appliesTo)) {
          return current;
        }

        final currentPriority = current?.priority ?? -1;
        final relevantRules = andAllowsIt ? next.allows : next.disallows;
        final nextRule = relevantRules.getRulingOnPath(concernsPath);

        if (nextRule == null || nextRule.priority < currentPriority) {
          return current;
        }
        return nextRule;
      });
}
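
Likewise, a minimal sketch of how `getRule()` resolves a ruling across rulesets (the user-agents, patterns and priorities are invented for the example); the specific `allow` wins because it carries the higher priority:

```dart
import 'package:robots_txt/robots_txt.dart';

void main() {
  // A wildcard ruleset: `/fish` paths are disallowed in general, but
  // `/fish/salmon` paths are explicitly allowed with a higher priority.
  final ruleset = Ruleset('*')
    ..disallows.add(Rule(RegExp('/fish.+'), 0))
    ..allows.add(Rule(RegExp('/fish/salmon.+'), 1));
  final allowance = [ruleset].getRule(
    appliesTo: 'googlebot',
    concernsPath: '/fish/salmon/pink',
    andAllowsIt: true,
  );
  print(allowance?.priority); // 1
}
```

`canVisitPath()` then compares the priorities of the best allowance and the best disallowance for the path, favouring the allowance on a tie.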
2 changes: 2 additions & 0 deletions lib/src/utils.dart
@@ -0,0 +1,2 @@
/// Taking the singular form of [word], morphs it according to [count].
String pluralise(String word, int count) => count == 1 ? word : '${word}s';