Commit f8dfb08 (initial commit, 0 parents)
Showing 11 changed files with 419 additions and 0 deletions.
.gitignore
@@ -0,0 +1,6 @@
# Files and directories created by pub.
.dart_tool/
.packages

# Conventional directory for build output.
build/
CHANGELOG.md
@@ -0,0 +1,3 @@
## 1.0.0

- Initial release
README.md
@@ -0,0 +1,28 @@
## A simple yet complete, lightweight and sturdy `robots.txt` ruleset parser to ensure your application follows the standard protocol.

### Usage

The following code gets the `robots.txt` robot exclusion ruleset of a website.

`quietMode` determines whether the library should print warning messages when the `robots.txt` file is invalid or another error occurs.

```dart
// Create an instance of the `robots.txt` parser.
final robots = Robots(host: 'host', quietMode: true);
// Read and parse the ruleset of the website.
robots.read().then((_) {
  for (final ruleset in robots.rulesets) {
    // Print the user-agent the ruleset applies to.
    print(ruleset.appliesTo);
    print('Allows:');
    // Print the path expressions allowed by this ruleset.
    for (final rule in ruleset.allows) {
      print(' - ${rule.expression}');
    }
    print('Disallows:');
    // Print the path expressions disallowed by this ruleset.
    for (final rule in ruleset.disallows) {
      print(' - ${rule.expression}');
    }
  }
});
```
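Once the rulesets have been read, `canVisitPath` can be used to query them directly. A minimal sketch (the user-agent name `Crawler1` and the path are purely illustrative):

```dart
// Check whether a particular user-agent may visit a particular path.
robots.read().then((_) {
  final canVisit = robots.canVisitPath('/gist/', userAgent: 'Crawler1');
  print(canVisit ? 'Allowed to visit /gist/' : 'Disallowed from visiting /gist/');
});
```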
analysis_options.yaml
@@ -0,0 +1 @@
include: package:words/core.yaml
lib/robots_txt.dart
@@ -0,0 +1,6 @@
/// Lightweight, sturdy and fully documented library for parsing the
/// `robots.txt` file. Nothing more, nothing less.
library robots_txt;

export 'src/parser.dart';
export 'src/rule.dart';
lib/src/parser.dart
@@ -0,0 +1,146 @@
import 'package:sprint/sprint.dart';
import 'package:web_scraper/web_scraper.dart';

import 'package:robots_txt/src/rule.dart';
import 'package:robots_txt/src/ruleset.dart';
import 'package:robots_txt/src/utils.dart';

/// Abstracts away the rather convoluted declaration for an element with two
// ignore: comment_references
/// fields: [title] and [attributes]. [attributes] is a map containing the
/// attributes of the element.
typedef Element = Map<String, Map<String, dynamic>>;

/// Allows for parsing of a host's `robots.txt` to get information about which
/// of its resources may or may not be accessed, as well as which of its pages
/// cannot be traversed.
class Robots {
  /// Instance of the `Sprint` message printer for the `robots.txt` parser.
  final Sprint log;

  /// The host of this `robots.txt` file.
  final String host;

  /// Stores an instance of the scraper for a given URL.
  final WebScraper scraper;

  /// Stores expressions for paths which may or may not be traversed.
  final List<Ruleset> rulesets = [];

  /// Creates an instance of a `robots.txt` parser for the [host].
  Robots({
    required this.host,
    bool quietMode = false,
    bool productionMode = true,
  })  : scraper = WebScraper(host),
        log = Sprint('Robots',
            quietMode: quietMode, productionMode: productionMode);

  /// Reads and parses the `robots.txt` file of the host.
  Future<List<Ruleset>> read() async {
    await scraper.loadWebPage('/robots.txt');
    final preformatted = scraper.getElement('pre', []);
    log.debug(preformatted);

    final invalidRobotsFileError = "'$host' has an invalid `robots.txt`:";

    if (preformatted.isEmpty) {
      log.warn('$invalidRobotsFileError No text elements found');
      return rulesets;
    }

    final attributes = preformatted[0]['attributes'] as Map<String, String>;
    log.debug(attributes);

    if (!attributes.containsKey('innerText') ||
        attributes['innerText']!.isEmpty) {
      log.warn('$invalidRobotsFileError '
          'The preformatted element does not contain text');
      return rulesets;
    }

    final lines = attributes['innerText']!.split('\n');
    return parseRuleset(lines);
  }

  /// Iterates over [lines] and parses each ruleset, additionally ignoring
  /// those rulesets which are not relevant to [onlyRelevantTo].
  List<Ruleset> parseRuleset(List<String> lines, {String? onlyRelevantTo}) {
    final rulesets = <Ruleset>[];

    Ruleset? ruleset;
    for (var index = 0; index < lines.length; index++) {
      final field = getRobotsFieldFromLine(lines[index]);

      switch (field.key) {
        case 'user-agent':
          if (ruleset != null) {
            rulesets.add(ruleset);
          }
          // Compare the user-agent *value*, not the field key, against the
          // agent the caller is interested in.
          if (onlyRelevantTo != null && field.value != onlyRelevantTo) {
            ruleset = null;
            break;
          }
          ruleset = Ruleset(field.value);
          break;
        case 'allow':
          if (ruleset != null) {
            final expression = convertFieldPathToExpression(field.value);
            ruleset.allows.add(Rule(expression, index));
          }
          break;
        case 'disallow':
          if (ruleset != null) {
            final expression = convertFieldPathToExpression(field.value);
            ruleset.disallows.add(Rule(expression, index));
          }
          break;
      }
    }
    // Add the final ruleset, which is not followed by another `user-agent`
    // line that would otherwise flush it into the list.
    if (ruleset != null) {
      rulesets.add(ruleset);
    }

    log.debug('Read robots.txt of $host: '
        '${rulesets.length} ${pluralise('ruleset', rulesets.length)}');
    return rulesets;
  }

  /// Reads a path declaration from within `robots.txt` and converts it to a
  /// regular expression for later matching.
  RegExp convertFieldPathToExpression(String pathDeclaration) {
    // Collapse runs of slashes and wildcards into singles. (`String` is
    // immutable, so the replacements must be chained, not cascaded.)
    final collapsed = pathDeclaration
        .replaceAll(RegExp('/+'), '/')
        .replaceAll(RegExp(r'\*+'), '*');
    final normalised = collapsed.endsWith('*')
        ? collapsed.substring(0, collapsed.length - 1)
        : collapsed;
    final withRegexWildcards =
        normalised.replaceAll('.', r'\.').replaceAll('*', '.+');
    final withTrailingText = withRegexWildcards.contains(r'$')
        ? withRegexWildcards.split(r'$')[0]
        : '$withRegexWildcards.+';
    return RegExp(withTrailingText);
  }

  /// Extracts the key and value from [target] and puts them into a `MapEntry`.
  MapEntry<String, String> getRobotsFieldFromLine(String target) {
    final keyValuePair = target.split(':');
    final key = keyValuePair[0].toLowerCase();
    final value = keyValuePair.sublist(1).join(':').trim();
    return MapEntry(key, value);
  }

  /// Determines whether or not [path] may be traversed by [userAgent].
  bool canVisitPath(String path, {required String userAgent}) {
    final explicitAllowance = rulesets.getRule(
        appliesTo: userAgent, concernsPath: path, andAllowsIt: true);
    final explicitDisallowance = rulesets.getRule(
        appliesTo: userAgent, concernsPath: path, andAllowsIt: false);

    final allowancePriority = explicitAllowance?.priority ?? -1;
    final disallowancePriority = explicitDisallowance?.priority ?? -1;

    return allowancePriority >= disallowancePriority;
  }
}
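As a quick illustration of how `convertFieldPathToExpression` handles wildcards and the `$` anchor, consider the following sketch (the host and paths are invented for demonstration):

```dart
final robots = Robots(host: 'https://example.com', quietMode: true);

// A plain prefix gains a trailing `.+`, matching any continuation of the path.
print(robots.convertFieldPathToExpression('/search').pattern);
// -> /search.+

// `*` is converted into the regular expression wildcard `.+`.
print(robots.convertFieldPathToExpression('/search/*/results').pattern);
// -> /search/.+/results.+

// A `$` anchor truncates the expression, so nothing may follow the path.
print(robots.convertFieldPathToExpression(r'/exact$').pattern);
// -> /exact
```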
lib/src/rule.dart
@@ -0,0 +1,30 @@
/// A single rule (either `Allow` or `Disallow`) inside the `robots.txt` file.
class Rule {
  /// An expression which a path may be matched against to determine whether
  /// this rule applies to the path.
  final RegExp expression;

  /// The priority of this rule based on its position inside the `robots.txt`
  /// file. If a path is determined to be relevant to two rules, the rule
  /// with the higher priority *overrides* the ruling of the other.
  final int priority;

  /// Instantiates a rule with an [expression] and the [priority] it has over
  /// other rules.
  const Rule(this.expression, this.priority);
}

/// Extends `List<Rule>` with a method for getting the `Rule` with the highest
/// [Rule.priority].
extension RulingOnPath on List<Rule> {
  /// Taking [path], checks which `Rule`s' expressions match [path], and
  /// returns the `Rule` with the highest priority.
  Rule? getRulingOnPath(String path) {
    final relevantRules = where((rule) => rule.expression.hasMatch(path));
    if (relevantRules.isEmpty) {
      return null;
    }
    // Return the relevant rule with the highest priority.
    return relevantRules.reduce((a, b) => a.priority > b.priority ? a : b);
  }
}
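A worked example of the priority rule described above (expressions and priorities are invented for illustration): given two rules that both match a path, `getRulingOnPath` returns the one declared later in the file.

```dart
final rules = [
  Rule(RegExp('/docs.+'), 2), // e.g. derived from `Allow: /docs`
  Rule(RegExp('/docs/private.+'), 3), // e.g. from `Disallow: /docs/private`
];
// Both expressions match, so the later (higher-priority) rule is returned.
print(rules.getRulingOnPath('/docs/private/file')?.priority); // 3
// No expression matches, so there is no ruling.
print(rules.getRulingOnPath('/about')); // null
```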
lib/src/ruleset.dart
@@ -0,0 +1,46 @@
import 'package:robots_txt/src/rule.dart';

/// A collection of `Rule`s, and the `user-agent` they are relevant to inside
/// the `robots.txt` file.
class Ruleset {
  /// The `user-agent` which this ruleset applies to.
  final String appliesTo;

  /// List of `Rule`s which explicitly state that a path may be traversed.
  final List<Rule> allows = [];

  /// List of `Rule`s which explicitly state that a path may not be traversed.
  final List<Rule> disallows = [];

  /// Instantiates a ruleset with the `user-agent` it applies to.
  Ruleset(this.appliesTo);

  /// Checks whether this ruleset applies to [userAgent].
  bool doesConcern(String userAgent) =>
      appliesTo == '*' || appliesTo == userAgent;
}

/// Extends `List<Ruleset>` with a method for getting a single `Rule` from the
/// list of `Ruleset`s.
extension RulingOfRulesets on List<Ruleset> {
  /// Gets the highest-priority rule which applies to [appliesTo], concerns
  /// [concernsPath], and either allows or disallows it as per [andAllowsIt].
  Rule? getRule({
    required String appliesTo,
    required String concernsPath,
    required bool andAllowsIt,
  }) =>
      fold<Rule?>(null, (current, next) {
        if (!next.doesConcern(appliesTo)) {
          return current;
        }

        final currentPriority = current?.priority ?? -1;
        final relevantRules = andAllowsIt ? next.allows : next.disallows;
        final nextRule = relevantRules.getRulingOnPath(concernsPath);

        if (nextRule == null || nextRule.priority < currentPriority) {
          return current;
        }
        return nextRule;
      });
}
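To show how `getRule` folds over multiple rulesets, here is a small sketch (agent names, paths and priorities are illustrative): the wildcard ruleset and the agent-specific ruleset both concern `Crawler1`, and the matching rule with the highest priority wins.

```dart
final rulesets = [
  Ruleset('*')..disallows.add(Rule(RegExp('/admin.+'), 1)),
  Ruleset('Crawler1')..allows.add(Rule(RegExp('/admin/public.+'), 4)),
];
final allowance = rulesets.getRule(
  appliesTo: 'Crawler1',
  concernsPath: '/admin/public/page',
  andAllowsIt: true,
);
print(allowance?.priority); // 4
```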
lib/src/utils.dart
@@ -0,0 +1,2 @@
/// Takes the singular form of [word] and pluralises it according to [count].
String pluralise(String word, int count) => count == 1 ? word : '${word}s';
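For instance, `pluralise('ruleset', 1)` yields `ruleset`, while any other count yields `rulesets`.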