From f8dfb08d8ae3b290509b4615a33e780a65227335 Mon Sep 17 00:00:00 2001
From: vxern
Date: Sun, 22 Aug 2021 12:58:20 +0100
Subject: [PATCH] Initial release: 1.0.0

---
 .gitignore            |   6 ++
 CHANGELOG.md          |   3 +
 README.md             |  28 ++++++++
 analysis_options.yaml |   1 +
 lib/robots_txt.dart   |   6 ++
 lib/src/parser.dart   | 146 ++++++++++++++++++++++++++++++++++++++++++
 lib/src/rule.dart     |  30 +++++++++
 lib/src/ruleset.dart  |  46 +++++++++++++
 lib/src/utils.dart    |   2 +
 pubspec.lock          | 131 +++++++++++++++++++++++++++++++++++++
 pubspec.yaml          |  20 ++++++
 11 files changed, 419 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 CHANGELOG.md
 create mode 100644 README.md
 create mode 100644 analysis_options.yaml
 create mode 100644 lib/robots_txt.dart
 create mode 100644 lib/src/parser.dart
 create mode 100644 lib/src/rule.dart
 create mode 100644 lib/src/ruleset.dart
 create mode 100644 lib/src/utils.dart
 create mode 100644 pubspec.lock
 create mode 100644 pubspec.yaml

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3c8a157
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+# Files and directories created by pub.
+.dart_tool/
+.packages
+
+# Conventional directory for build output.
+build/
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..6bde737
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,3 @@
+## 1.0.0
+
+ - Initial release
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9891e4c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,28 @@
+## A simple yet complete, lightweight and sturdy `robots.txt` ruleset parser that ensures your application follows the robots exclusion protocol.
+
+### Usage
+
+The following code fetches and parses the `robots.txt` robot exclusion ruleset of a website.
+
+`quietMode` determines whether the library should print warning messages when the `robots.txt` file is invalid or another error occurs.
+
+```dart
+// Create an instance of the `robots.txt` parser
+final robots = Robots(host: 'host', quietMode: true);
+// Read the ruleset of the website
+robots.read().then((_) {
+  for (final ruleset in robots.rulesets) {
+    // Print the user-agent the ruleset applies to
+    print(ruleset.appliesTo);
+    print('Allows:');
+    // Print the path expressions allowed by this ruleset
+    for (final rule in ruleset.allows) {
+      print('  - ${rule.expression}');
+    }
+    // Print the path expressions disallowed by this ruleset
+    for (final rule in ruleset.disallows) {
+      print('  - ${rule.expression}');
+    }
+  }
+});
+```
\ No newline at end of file
diff --git a/analysis_options.yaml b/analysis_options.yaml
new file mode 100644
index 0000000..e925772
--- /dev/null
+++ b/analysis_options.yaml
@@ -0,0 +1 @@
+include: package:words/core.yaml
\ No newline at end of file
diff --git a/lib/robots_txt.dart b/lib/robots_txt.dart
new file mode 100644
index 0000000..6406f4f
--- /dev/null
+++ b/lib/robots_txt.dart
@@ -0,0 +1,6 @@
+/// Lightweight, sturdy and fully documented library for parsing the
+/// `robots.txt` file. Nothing more, nothing less.
+library robots_txt;
+
+export 'src/parser.dart';
+export 'src/rule.dart';
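Note: to complement the README example above, here is a minimal end-to-end sketch of how the library exported by `lib/robots_txt.dart` might be used to gate a crawler. The host `https://example.com`, the path and the user-agent name are hypothetical; the snippet is illustrative only and not part of the patch.

```dart
import 'package:robots_txt/robots_txt.dart';

Future<void> main() async {
  // Fetch and parse the `robots.txt` of a hypothetical host.
  final robots = Robots(host: 'https://example.com');
  await robots.read();

  // Check whether a hypothetical crawler may visit a given path.
  const path = '/private/data.json';
  final canVisit = robots.canVisitPath(path, userAgent: 'my-crawler');
  print(canVisit ? 'May visit $path' : 'May not visit $path');
}
```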
diff --git a/lib/src/parser.dart b/lib/src/parser.dart
new file mode 100644
index 0000000..dbf3824
--- /dev/null
+++ b/lib/src/parser.dart
@@ -0,0 +1,146 @@
+import 'package:sprint/sprint.dart';
+import 'package:web_scraper/web_scraper.dart';
+
+import 'package:robots_txt/src/rule.dart';
+import 'package:robots_txt/src/ruleset.dart';
+import 'package:robots_txt/src/utils.dart';
+
+/// Abstracts away the rather convoluted declaration for an element with two
+// ignore: comment_references
+/// fields: [title] and [attributes]. [attributes] is a map containing the
+/// attributes of the element.
+typedef Element = Map<String, Map<String, dynamic>>;
+
+/// Allows for parsing of a host's `robots.txt` to get information about which
+/// of its resources may or may not be accessed, as well as which of its pages
+/// cannot be traversed.
+class Robots {
+  /// Instance of the `Sprint` message printer for the `robots.txt` parser.
+  final Sprint log;
+
+  /// The host of this `robots.txt` file.
+  final String host;
+
+  /// Stores an instance of the scraper for a given URL.
+  final WebScraper scraper;
+
+  /// Stores expressions for paths which may or may not be traversed.
+  final List<Ruleset> rulesets = [];
+
+  /// Creates an instance of a `robots.txt` parser for the [host].
+  Robots({
+    required this.host,
+    bool quietMode = false,
+    bool productionMode = true,
+  })  : scraper = WebScraper(host),
+        log = Sprint('Robots',
+            quietMode: quietMode, productionMode: productionMode);
+
+  /// Reads and parses the `robots.txt` file of the host.
+  Future<List<Ruleset>> read() async {
+    await scraper.loadWebPage('/robots.txt');
+    final preformatted = scraper.getElement('pre', []);
+    log.debug(preformatted);
+
+    final invalidRobotsFileError = "'$host' has an invalid `robots.txt`:";
+
+    if (preformatted.isEmpty) {
+      log.warn('$invalidRobotsFileError No text elements found');
+      return rulesets;
+    }
+
+    final attributes = preformatted[0]['attributes'] as Map<String, dynamic>;
+    log.debug(attributes);
+
+    if (!attributes.containsKey('innerText') ||
+        attributes['innerText']!.isEmpty) {
+      log.warn('$invalidRobotsFileError '
+          'The preformatted element does not contain text');
+      return rulesets;
+    }
+
+    final lines = attributes['innerText']!.split('\n');
+    return rulesets..addAll(parseRuleset(lines));
+  }
+
+  /// Iterates over [lines] and parses each ruleset, additionally ignoring
+  /// those rulesets which are not relevant to [onlyRelevantTo].
+  List<Ruleset> parseRuleset(List<String> lines, {String? onlyRelevantTo}) {
+    final rulesets = <Ruleset>[];
+
+    Ruleset? ruleset;
+    for (var index = 0; index < lines.length; index++) {
+      final field = getRobotsFieldFromLine(lines[index]);
+
+      switch (field.key) {
+        case 'user-agent':
+          if (onlyRelevantTo != null && field.value != onlyRelevantTo) {
+            ruleset = null;
+            break;
+          }
+          // Add the ruleset to the list upfront so that the final ruleset
+          // in the file is not dropped once the lines run out.
+          ruleset = Ruleset(field.value);
+          rulesets.add(ruleset);
+          break;
+
+        case 'allow':
+          if (ruleset != null) {
+            final expression = convertFieldPathToExpression(field.value);
+            ruleset.allows.add(Rule(expression, index));
+          }
+          break;
+        case 'disallow':
+          if (ruleset != null) {
+            final expression = convertFieldPathToExpression(field.value);
+            ruleset.disallows.add(Rule(expression, index));
+          }
+          break;
+      }
+    }
+
+    log.debug(
+        'Read robots.txt of $host: ${pluralise('ruleset', rulesets.length)}');
+    return rulesets;
+  }
+
+  /// Reads a path declaration from within `robots.txt` and converts it to a
+  /// regular expression for later matching.
+  RegExp convertFieldPathToExpression(String pathDeclaration) {
+    // Collapse runs of slashes and wildcards into singles. `replaceAll` is
+    // chained rather than cascaded; a cascade on an immutable `String` would
+    // discard its results.
+    final collapsed = pathDeclaration
+        .replaceAll(RegExp('/+'), '/')
+        .replaceAll(RegExp(r'\*+'), '*');
+    final normalised = collapsed.endsWith('*')
+        ? collapsed.substring(0, collapsed.length - 1)
+        : collapsed;
+    final withRegexWildcards = normalised
+        .replaceAll('.', r'\.')
+        .replaceAll('*', '.+');
+    final withTrailingText = withRegexWildcards.contains(r'$')
+        ? withRegexWildcards.split(r'$')[0]
+        : '$withRegexWildcards.+';
+    return RegExp(withTrailingText);
+  }
+
+  /// Extracts the key and value from [target] and puts them into a `MapEntry`.
+  MapEntry<String, String> getRobotsFieldFromLine(String target) {
+    final keyValuePair = target.split(':');
+    final key = keyValuePair[0].toLowerCase();
+    final value = keyValuePair.sublist(1).join(':').trim();
+    return MapEntry(key, value);
+  }
+
+  /// Determines whether or not [path] may be traversed.
+  bool canVisitPath(String path, {required String userAgent}) {
+    final explicitAllowance = rulesets.getRule(
+        appliesTo: userAgent, concernsPath: path, andAllowsIt: true);
+    final explicitDisallowance = rulesets.getRule(
+        appliesTo: userAgent, concernsPath: path, andAllowsIt: false);
+
+    final allowancePriority = explicitAllowance?.priority ?? -1;
+    final disallowancePriority = explicitDisallowance?.priority ?? -1;
+
+    return allowancePriority >= disallowancePriority;
+  }
+}
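Note: the following sketch shows what `convertFieldPathToExpression` above produces for two common path declarations, assuming the chained `replaceAll` calls in the patch. The host is hypothetical and the snippet is illustrative only.

```dart
import 'package:robots_txt/src/parser.dart';

void main() {
  final robots = Robots(host: 'https://example.com'); // Hypothetical host.

  // A trailing wildcard becomes a prefix match: `/private/` followed by
  // at least one character.
  final wildcard = robots.convertFieldPathToExpression('/private/*');
  print(wildcard.pattern); // /private/.+
  print(wildcard.hasMatch('/private/file.txt')); // true

  // A `$`-anchored declaration keeps only the text before the anchor.
  final anchored = robots.convertFieldPathToExpression(r'/*.json$');
  print(anchored.pattern); // /.+\.json
  print(anchored.hasMatch('/data/records.json')); // true
}
```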
diff --git a/lib/src/rule.dart b/lib/src/rule.dart
new file mode 100644
index 0000000..78a3215
--- /dev/null
+++ b/lib/src/rule.dart
@@ -0,0 +1,30 @@
+/// A single rule (either `Allow` or `Disallow`) inside the `robots.txt` file.
+class Rule {
+  /// An expression which a path may be matched against to determine whether
+  /// this rule applies to the path.
+  final RegExp expression;
+
+  /// The priority of this rule based on its position inside the `robots.txt`
+  /// file. If a path is determined to be relevant to two rules, the rule
+  /// with the higher priority *overrides* the ruling of the other.
+  final int priority;
+
+  /// Instantiates a rule with an [expression] and the [priority] it has over
+  /// other rules.
+  const Rule(this.expression, this.priority);
+}
+
+/// Extends `List<Rule>` with a method for getting the `Rule` with the highest
+/// [Rule.priority].
+extension RulingOnPath on List<Rule> {
+  /// Taking [path], checks which `Rule`s' expressions match [path], and
+  /// returns the `Rule` with the highest priority.
+  Rule? getRulingOnPath(String path) {
+    final relevantRules = where((rule) => rule.expression.hasMatch(path));
+    if (relevantRules.isEmpty) {
+      return null;
+    }
+    // Get the relevant rule with the highest priority.
+    return relevantRules.reduce((a, b) => a.priority > b.priority ? a : b);
+  }
+}
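Note: a short sketch of how the `getRulingOnPath` extension above arbitrates between overlapping rules; the expressions and priorities are made up for illustration.

```dart
import 'package:robots_txt/src/rule.dart';

void main() {
  // Both rules match the path below; the rule declared later in the file
  // (and therefore carrying the higher priority) wins.
  final rules = [
    Rule(RegExp('/files/.+'), 0),
    Rule(RegExp('/files/secret/.+'), 1),
  ];

  final ruling = rules.getRulingOnPath('/files/secret/document.txt');
  print(ruling?.priority); // 1
}
```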
diff --git a/lib/src/ruleset.dart b/lib/src/ruleset.dart
new file mode 100644
index 0000000..a83f142
--- /dev/null
+++ b/lib/src/ruleset.dart
@@ -0,0 +1,46 @@
+import 'package:robots_txt/src/rule.dart';
+
+/// A collection of `Rule`s, and the `user-agent` they are relevant to inside
+/// the `robots.txt` file.
+class Ruleset {
+  /// The `user-agent` which this ruleset applies to.
+  final String appliesTo;
+
+  /// List of `Rule`s which explicitly state that a path may be traversed.
+  final List<Rule> allows = [];
+
+  /// List of `Rule`s which explicitly state that a path may not be traversed.
+  final List<Rule> disallows = [];
+
+  /// Instantiates a ruleset with the `user-agent` it applies to.
+  Ruleset(this.appliesTo);
+
+  /// Checks whether this ruleset applies to [userAgent].
+  bool doesConcern(String userAgent) =>
+      appliesTo == '*' || appliesTo == userAgent;
+}
+
+/// Extends `List<Ruleset>` with a method for getting a single `Rule` from the
+/// list of `Ruleset`s.
+extension RulingOfRulesets on List<Ruleset> {
+  /// Gets the rule which [appliesTo], [concernsPath] [andAllowsIt].
+  Rule? getRule({
+    required String appliesTo,
+    required String concernsPath,
+    required bool andAllowsIt,
+  }) =>
+      fold<Rule?>(null, (current, next) {
+        if (!next.doesConcern(appliesTo)) {
+          return current;
+        }
+
+        final currentPriority = current?.priority ?? -1;
+        final relevantRules = andAllowsIt ? next.allows : next.disallows;
+        final nextRule = relevantRules.getRulingOnPath(concernsPath);
+
+        if (nextRule == null || nextRule.priority < currentPriority) {
+          return current;
+        }
+        return nextRule;
+      });
+}
diff --git a/lib/src/utils.dart b/lib/src/utils.dart
new file mode 100644
index 0000000..e207f42
--- /dev/null
+++ b/lib/src/utils.dart
@@ -0,0 +1,2 @@
+/// Takes the singular form of [word] and pluralises it according to [count].
+String pluralise(String word, int count) => count == 1 ? word : '${word}s';
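Note: to tie `Ruleset`, `Rule` and the extensions together, here is a sketch of the arbitration that `canVisitPath` performs internally; all rules, paths and the agent name are illustrative.

```dart
import 'package:robots_txt/src/rule.dart';
import 'package:robots_txt/src/ruleset.dart';

void main() {
  // Equivalent `robots.txt`:
  //   User-agent: *
  //   Disallow: /secret/
  //   Allow: /secret/public/
  final ruleset = Ruleset('*')
    ..disallows.add(Rule(RegExp('/secret/.+'), 1))
    ..allows.add(Rule(RegExp('/secret/public/.+'), 2));

  final rulesets = [ruleset];
  const path = '/secret/public/page.html';

  final allowance = rulesets.getRule(
      appliesTo: 'my-crawler', concernsPath: path, andAllowsIt: true);
  final disallowance = rulesets.getRule(
      appliesTo: 'my-crawler', concernsPath: path, andAllowsIt: false);

  // The `Allow` rule was declared later, so its priority prevails.
  print((allowance?.priority ?? -1) >= (disallowance?.priority ?? -1)); // true
}
```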
diff --git a/pubspec.lock b/pubspec.lock
new file mode 100644
index 0000000..b5dbabb
--- /dev/null
+++ b/pubspec.lock
@@ -0,0 +1,131 @@
+# Generated by pub
+# See https://dart.dev/tools/pub/glossary#lockfile
+packages:
+  ansicolor:
+    dependency: transitive
+    description:
+      name: ansicolor
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "2.0.1"
+  async:
+    dependency: transitive
+    description:
+      name: async
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "2.8.2"
+  charcode:
+    dependency: transitive
+    description:
+      name: charcode
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.3.1"
+  collection:
+    dependency: transitive
+    description:
+      name: collection
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.15.0"
+  csslib:
+    dependency: transitive
+    description:
+      name: csslib
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "0.17.0"
+  html:
+    dependency: transitive
+    description:
+      name: html
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "0.15.0"
+  http:
+    dependency: transitive
+    description:
+      name: http
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "0.13.3"
+  http_parser:
+    dependency: transitive
+    description:
+      name: http_parser
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "4.0.0"
+  meta:
+    dependency: transitive
+    description:
+      name: meta
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.7.0"
+  path:
+    dependency: transitive
+    description:
+      name: path
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.8.0"
+  pedantic:
+    dependency: transitive
+    description:
+      name: pedantic
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.11.1"
+  source_span:
+    dependency: transitive
+    description:
+      name: source_span
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.8.1"
+  sprint:
+    dependency: "direct main"
+    description:
+      name: sprint
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.0.2+2"
+  string_scanner:
+    dependency: transitive
+    description:
+      name: string_scanner
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.1.0"
+  term_glyph:
+    dependency: transitive
+    description:
+      name: term_glyph
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.2.0"
+  typed_data:
+    dependency: transitive
+    description:
+      name: typed_data
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "1.3.0"
+  web_scraper:
+    dependency: "direct main"
+    description:
+      name: web_scraper
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "0.1.4"
+  words:
+    dependency: "direct dev"
+    description:
+      name: words
+      url: "https://pub.dartlang.org"
+    source: hosted
+    version: "0.0.1+2"
+sdks:
+  dart: ">=2.13.0 <3.0.0"
diff --git a/pubspec.yaml b/pubspec.yaml
new file mode 100644
index 0000000..3d73876
--- /dev/null
+++ b/pubspec.yaml
@@ -0,0 +1,20 @@
+publish_to: https://pub.dev
+
+name: robots_txt
+version: 1.0.0
+
+description: Lightweight, sturdy and fully-documented parser of the `robots.txt` file.
+
+homepage: https://github.com/wordcollector/robots_txt
+repository: https://github.com/wordcollector/robots_txt
+issue_tracker: https://github.com/wordcollector/robots_txt/issues
+
+environment:
+  sdk: '>=2.13.0 <3.0.0'
+
+dependencies:
+  sprint: ^1.0.2+2
+  web_scraper: ^0.1.4
+
+dev_dependencies:
+  words: ^0.0.1+2
\ No newline at end of file