Skip to content

Commit

Permalink
Some small parser improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
Mitch528 committed Mar 25, 2016
1 parent ee271e3 commit 91cacb6
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 46 deletions.
10 changes: 7 additions & 3 deletions WebNovelConverter/Extensions/ElementExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,13 @@ public static IElement FirstWhereHasClass(
where e.HasAttribute("class")
where filter?.Invoke(e) ?? true
let names = e.GetAttribute("class").Split(' ')
from name in classes
where names.Any(p => p.Equals(name, StringComparison.OrdinalIgnoreCase))
orderby classes.IndexOf(name)
let a = (from cl in classes
from name in names
where cl.Equals(name, StringComparison.OrdinalIgnoreCase)
select cl).FirstOrDefault()
where a != null
let index = classes.IndexOf(a)
orderby index
select e).FirstOrDefault();
}

Expand Down
1 change: 0 additions & 1 deletion WebNovelConverter/MainForm.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ private void MainForm_Load(object sender, EventArgs e)
_sources.Add(new RoyalRoadLSource());
_sources.Add(new BakaTsukiSource());
_sources.Add(new BlogspotSource());
_sources.Add(new WuxiaWorldSource());
_sources.Add(new NovelsNaoSource());

websiteTypeComboBox.SelectedIndex = 0;
Expand Down
47 changes: 42 additions & 5 deletions WebNovelConverter/Sources/WordPressSource.cs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ public class WordPressSource : WebNovelSource
{
"entry-title",
"post-title",
"page-title"
"page-title",
"title-block"
};

protected readonly List<string> NextChapterNames = new List<string>
Expand All @@ -63,6 +64,17 @@ public class WordPressSource : WebNovelSource
"Next"
};


/// <summary>
/// Text fragments that mark a link as chapter navigation rather than content.
/// RemoveNavigation removes any "a" element whose text contains one of these
/// entries, compared as a case-insensitive substring.
/// </summary>
protected readonly List<string> NavigationNames = new List<string>
{
"Next Chapter",
"Next",
"Previous Chapter",
"Prev",
"Table of Contents",
"Index"
};

public WordPressSource() : base("WordPress")
{
}
Expand Down Expand Up @@ -123,8 +135,8 @@ public override async Task<WebNovelChapter> GetChapterAsync(ChapterLink link,

protected virtual WebNovelChapter ParseChapter(IElement rootElement, CancellationToken token = default(CancellationToken))
{
IElement element = rootElement.FirstWhereHasClass(PostClasses)
?? rootElement.Descendents<IElement>().FirstOrDefault(p => p.LocalName == "article");
IElement articleElement = rootElement.Descendents<IElement>().FirstOrDefault(p => p.LocalName == "article");
IElement element = rootElement.FirstWhereHasClass(PostClasses) ?? articleElement;

if (element != null)
RemoveBloat(element);
Expand All @@ -148,12 +160,14 @@ public override async Task<WebNovelChapter> GetChapterAsync(ChapterLink link,
chapterNameElement = chNameLinkElement;
}

IElement nextChapterElement = (from e in rootElement.Descendents<IElement>() ?? new List<IElement>()
IElement nextChapterElement = (from e in articleElement?.Descendents<IElement>() ?? rootElement.Descendents<IElement>()
where e.LocalName == "a"
let text = e.Text()
let a = NextChapterNames.FirstOrDefault(p => text.IndexOf(p, StringComparison.OrdinalIgnoreCase) >= 0)
where a != null || (e.HasAttribute("rel") && e.GetAttribute("rel") == "next")
orderby NextChapterNames.IndexOf(a)
let index = NextChapterNames.IndexOf(a)
let o = index >= 0 ? index : int.MaxValue
orderby o
select e).FirstOrDefault();

WebNovelChapter chapter = new WebNovelChapter();
Expand All @@ -165,6 +179,7 @@ orderby NextChapterNames.IndexOf(a)
if (element != null)
{
RemoveNavigation(element);
RemoveScriptStyleElements(element);

chapter.ChapterName = chapterNameElement?.Text()?.Trim();
chapter.Content = element.InnerHtml;
Expand Down Expand Up @@ -207,6 +222,28 @@ protected virtual void RemoveBloat(IElement element)

/// <summary>
/// Removes navigation links from the given element: every descendant "a"
/// element whose text contains an entry of NavigationNames
/// (case-insensitive substring match) is removed.
/// </summary>
/// <param name="element">Element whose descendant links are inspected.</param>
protected virtual void RemoveNavigation(IElement element)
{
    // Collect every match before removing anything, mirroring the
    // materialise-then-remove order of a ToList() followed by a loop.
    List<IElement> navigationLinks = new List<IElement>();

    foreach (IElement candidate in element.Descendents<IElement>().Where(node => node.LocalName == "a"))
    {
        string label = candidate.Text();

        bool looksLikeNavigation = NavigationNames.Any(
            name => label.IndexOf(name, StringComparison.OrdinalIgnoreCase) >= 0);

        if (looksLikeNavigation)
        {
            navigationLinks.Add(candidate);
        }
    }

    foreach (IElement link in navigationLinks)
    {
        link.Remove();
    }
}

/// <summary>
/// Removes every descendant of the given element whose local name is
/// "script" or "style".
/// </summary>
/// <param name="element">Element whose descendants are inspected.</param>
protected virtual void RemoveScriptStyleElements(IElement element)
{
    // Materialise the matches first, then remove them one by one.
    List<IElement> unwanted = element.Descendents<IElement>()
        .Where(node => node.LocalName == "script" || node.LocalName == "style")
        .ToList();

    foreach (IElement target in unwanted)
    {
        target.Remove();
    }
}
}
}
37 changes: 0 additions & 37 deletions WebNovelConverter/Sources/WuxiaWorldSource.cs

This file was deleted.

0 comments on commit 91cacb6

Please sign in to comment.