Initial release: 1.0.0
vxern committed Aug 22, 2021
0 parents commit f8dfb08
Showing 11 changed files with 419 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@
# Files and directories created by pub.
.dart_tool/
.packages

# Conventional directory for build output.
build/
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,3 @@
## 1.0.0

- Initial release
28 changes: 28 additions & 0 deletions README.md
@@ -0,0 +1,28 @@
## A simple yet complete, lightweight and sturdy `robots.txt` ruleset parser that ensures your application follows the robots exclusion protocol.

### Usage

The following code reads and parses the `robots.txt` robot exclusion ruleset of a website.

`quietMode` determines whether the library should print warning messages when the `robots.txt` file is invalid or another error occurs.

```dart
// Create an instance of the `robots.txt` parser.
final robots = Robots(host: 'host', quietMode: true);
// Read and parse the ruleset of the website.
robots.read().then((_) {
  for (final ruleset in robots.rulesets) {
    // Print the user-agent this ruleset applies to.
    print(ruleset.appliesTo);
    print('Allows:');
    // Print the path expressions allowed by this ruleset.
    for (final rule in ruleset.allows) {
      print(' - ${rule.expression}');
    }
    print('Disallows:');
    // Print the path expressions disallowed by this ruleset.
    for (final rule in ruleset.disallows) {
      print(' - ${rule.expression}');
    }
  }
});
```
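
Once the ruleset has been read, `canVisitPath()` reports whether a given path may be traversed on behalf of a particular user-agent. Continuing from the snippet above, a minimal sketch (the path is invented for illustration):

```dart
// Check whether the wildcard user-agent may visit a hypothetical path.
final canVisit = robots.canVisitPath('/gist/', userAgent: '*');
print(canVisit ? 'The path may be traversed.' : 'The path may not be traversed.');
```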
1 change: 1 addition & 0 deletions analysis_options.yaml
@@ -0,0 +1 @@
include: package:words/core.yaml
6 changes: 6 additions & 0 deletions lib/robots_txt.dart
@@ -0,0 +1,6 @@
/// Lightweight, sturdy and fully documented library for parsing the
/// `robots.txt` file. Nothing more, nothing less.
library robots_txt;

export 'src/parser.dart';
export 'src/rule.dart';
export 'src/ruleset.dart';
146 changes: 146 additions & 0 deletions lib/src/parser.dart
@@ -0,0 +1,146 @@
import 'package:sprint/sprint.dart';
import 'package:web_scraper/web_scraper.dart';

import 'package:robots_txt/src/rule.dart';
import 'package:robots_txt/src/ruleset.dart';
import 'package:robots_txt/src/utils.dart';

/// Abstracts away the rather convoluted declaration for an element with two
// ignore: comment_references
/// fields: [title] and [attributes]. [attributes] is a map containing the
/// attributes of the element.
typedef Element = Map<String, Map<String, dynamic>>;

/// Allows for parsing of a host's `robots.txt` to get information about which
/// of its resources may or may not be accessed, as well as which of its pages
/// cannot be traversed.
class Robots {
  /// Instance of `Sprint` message printer for the `robots.txt` parser.
  final Sprint log;

  /// The host of this `robots.txt` file.
  final String host;

  /// Stores an instance of the scraper for a given URL.
  final WebScraper scraper;

  /// Stores expressions for both paths which may or may not be traversed.
  final List<Ruleset> rulesets = [];

  /// Creates an instance of a `robots.txt` parser for the [host].
  Robots({
    required this.host,
    bool quietMode = false,
    bool productionMode = true,
  })  : scraper = WebScraper(host),
        log = Sprint('Robots',
            quietMode: quietMode, productionMode: productionMode);

  /// Reads and parses the `robots.txt` file of the host.
  Future<List<Ruleset>> read() async {
    await scraper.loadWebPage('/robots.txt');
    final preformatted = scraper.getElement('pre', []);
    log.debug(preformatted);

    final invalidRobotsFileError = "'$host' has an invalid `robots.txt`:";

    if (preformatted.isEmpty) {
      log.warn('$invalidRobotsFileError No text elements found');
      return rulesets;
    }

    final attributes = preformatted[0]['attributes'] as Map<String, String>;
    log.debug(attributes);

    if (!attributes.containsKey('innerText') ||
        attributes['innerText']!.isEmpty) {
      log.warn('$invalidRobotsFileError '
          'The preformatted element does not contain text');
      return rulesets;
    }

    final lines = attributes['innerText']!.split('\n');
    // Store the parsed rulesets so they remain accessible through [rulesets].
    rulesets.addAll(parseRuleset(lines));
    return rulesets;
  }

  /// Iterates over [lines] and parses each ruleset, additionally ignoring
  /// those rulesets which are not relevant to [onlyRelevantTo].
  List<Ruleset> parseRuleset(List<String> lines, {String? onlyRelevantTo}) {
    final rulesets = <Ruleset>[];

    Ruleset? ruleset;
    for (var index = 0; index < lines.length; index++) {
      final field = getRobotsFieldFromLine(lines[index]);

      switch (field.key) {
        case 'user-agent':
          if (ruleset != null) {
            rulesets.add(ruleset);
          }
          if (onlyRelevantTo != null && field.value != onlyRelevantTo) {
            ruleset = null;
            break;
          }
          ruleset = Ruleset(field.value);
          break;

        case 'allow':
          if (ruleset != null) {
            final expression = convertFieldPathToExpression(field.value);
            ruleset.allows.add(Rule(expression, index));
          }
          break;
        case 'disallow':
          if (ruleset != null) {
            final expression = convertFieldPathToExpression(field.value);
            ruleset.disallows.add(Rule(expression, index));
          }
          break;
      }
    }
    // Commit the final ruleset, which is not followed by another `user-agent`
    // field and would otherwise be dropped.
    if (ruleset != null) {
      rulesets.add(ruleset);
    }

    log.debug('Read robots.txt of $host: '
        '${rulesets.length} ${pluralise('ruleset', rulesets.length)}');
    return rulesets;
  }

  /// Reads a path declaration from within `robots.txt` and converts it to a
  /// regular expression for later matching.
  RegExp convertFieldPathToExpression(String pathDeclaration) {
    // Collapse runs of slashes and wildcards into single characters.
    final collapsed = pathDeclaration
        .replaceAll(RegExp('/+'), '/')
        .replaceAll(RegExp(r'\*+'), '*');
    final normalised = collapsed.endsWith('*')
        ? collapsed.substring(0, collapsed.length - 1)
        : collapsed;
    final withRegexWildcards = normalised
        .replaceAll('.', r'\.')
        .replaceAll('*', '.+');
    final withTrailingText = withRegexWildcards.contains(r'$')
        ? withRegexWildcards.split(r'$')[0]
        : '$withRegexWildcards.+';
    return RegExp(withTrailingText);
  }

  /// Extracts the key and value from [target] and puts them into a `MapEntry`.
  MapEntry<String, String> getRobotsFieldFromLine(String target) {
    final keyValuePair = target.split(':');
    final key = keyValuePair[0].toLowerCase();
    final value = keyValuePair.sublist(1).join(':').trim();
    return MapEntry(key, value);
  }

  /// Determines whether or not [path] may be traversed by [userAgent].
  bool canVisitPath(String path, {required String userAgent}) {
    final explicitAllowance = rulesets.getRule(
        appliesTo: userAgent, concernsPath: path, andAllowsIt: true);
    final explicitDisallowance = rulesets.getRule(
        appliesTo: userAgent, concernsPath: path, andAllowsIt: false);

    final allowancePriority = explicitAllowance?.priority ?? -1;
    final disallowancePriority = explicitDisallowance?.priority ?? -1;

    return allowancePriority >= disallowancePriority;
  }
}
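
To make the path-matching step above concrete, here is a minimal sketch of `convertFieldPathToExpression()` in action; the host and path declarations are invented for illustration. A trailing `*` is dropped, `.` is escaped, and the remaining wildcards become `.+`:

```dart
import 'package:robots_txt/robots_txt.dart';

void main() {
  final robots = Robots(host: 'https://example.com');
  // `/fish*` loses its trailing wildcard and becomes the pattern `/fish.+`.
  final expression = robots.convertFieldPathToExpression('/fish*');
  print(expression.hasMatch('/fish/salmon')); // true
  print(expression.hasMatch('/Fish.asp')); // false
}
```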
30 changes: 30 additions & 0 deletions lib/src/rule.dart
@@ -0,0 +1,30 @@
/// A single rule (either `Allow` or `Disallow`) inside the `robots.txt` file.
class Rule {
  /// An expression which a path may be matched against to determine whether
  /// this rule applies to the path.
  final RegExp expression;

  /// The priority of this rule based on its position inside the `robots.txt`
  /// file. If the path is determined to be relevant to two rules, the rule
  /// with the higher priority *overrides* the ruling of the other.
  final int priority;

  /// Instantiates a rule with an [expression] and the [priority] it has over
  /// other rules.
  const Rule(this.expression, this.priority);
}

/// Extends `List<Rule>` with a method for getting the `Rule` with the highest
/// [Rule.priority].
extension RulingOnPath on List<Rule> {
  /// Taking [path], checks which `Rule`s' expressions match [path], and
  /// returns the `Rule` with the highest priority.
  Rule? getRulingOnPath(String path) {
    final relevantRules = where((rule) => rule.expression.hasMatch(path));
    if (relevantRules.isEmpty) {
      return null;
    }
    // Get the relevant rule with the highest priority.
    return relevantRules.reduce((a, b) => a.priority > b.priority ? a : b);
  }
}
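
The priority mechanism can be illustrated with a short sketch; the patterns and priorities here are invented for the example. Of two rules matching the same path, `getRulingOnPath()` returns the one declared later in the file:

```dart
import 'package:robots_txt/robots_txt.dart';

void main() {
  // Both rules match the path below; the later (higher-priority) one wins.
  final rules = [
    Rule(RegExp('/fish.+'), 0),
    Rule(RegExp('/fish/salmon.+'), 1),
  ];
  final ruling = rules.getRulingOnPath('/fish/salmon/pink');
  print(ruling?.priority); // 1
}
```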
46 changes: 46 additions & 0 deletions lib/src/ruleset.dart
@@ -0,0 +1,46 @@
import 'package:robots_txt/src/rule.dart';

/// A collection of `Rule`s, and the `user-agent` they are relevant to inside
/// the `robots.txt` file.
class Ruleset {
  /// The `user-agent` which this ruleset applies to.
  final String appliesTo;

  /// List of `Rule`s which explicitly state that a path may be traversed.
  final List<Rule> allows = [];

  /// List of `Rule`s which explicitly state that a path may not be traversed.
  final List<Rule> disallows = [];

  /// Instantiates a ruleset with the `user-agent`.
  Ruleset(this.appliesTo);

  /// Checks whether this ruleset applies to [userAgent].
  bool doesConcern(String userAgent) =>
      appliesTo == '*' || appliesTo == userAgent;
}

/// Extends `List<Ruleset>` with a method for getting a single `Rule` from the
/// list of `Ruleset`s.
extension RulingOfRulesets on List<Ruleset> {
  /// Gets the rule which [appliesTo], [concernsPath] and [andAllowsIt].
  Rule? getRule({
    required String appliesTo,
    required String concernsPath,
    required bool andAllowsIt,
  }) =>
      fold<Rule?>(null, (current, next) {
        if (!next.doesConcern(appliesTo)) {
          return current;
        }

        final currentPriority = current?.priority ?? -1;
        final relevantRules = andAllowsIt ? next.allows : next.disallows;
        final nextRule = relevantRules.getRulingOnPath(concernsPath);

        if (nextRule == null || nextRule.priority < currentPriority) {
          return current;
        }
        return nextRule;
      });
}
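
Likewise, a minimal sketch of how `getRule()` resolves a ruling across rulesets (the user-agents, patterns and priorities are invented for the example); the specific `allow` wins because it carries the higher priority:

```dart
import 'package:robots_txt/robots_txt.dart';

void main() {
  // A wildcard ruleset: `/fish` paths are disallowed in general, but
  // `/fish/salmon` paths are explicitly allowed with a higher priority.
  final ruleset = Ruleset('*')
    ..disallows.add(Rule(RegExp('/fish.+'), 0))
    ..allows.add(Rule(RegExp('/fish/salmon.+'), 1));
  final allowance = [ruleset].getRule(
    appliesTo: 'googlebot',
    concernsPath: '/fish/salmon/pink',
    andAllowsIt: true,
  );
  print(allowance?.priority); // 1
}
```

`canVisitPath()` then compares the priorities of the best allowance and the best disallowance for the path, favouring the allowance on a tie.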
2 changes: 2 additions & 0 deletions lib/src/utils.dart
@@ -0,0 +1,2 @@
/// Taking the singular form of [word], morphs it according to [count].
String pluralise(String word, int count) => count == 1 ? word : '${word}s';