Skip to content

Commit fb39d4f

Browse files
committed
🚀 Reformat parsers
* Reformatted parsers
* Direct parsing
* Scalable
* Slightly faster parsing (no optimizations yet)

Part-of: #134
1 parent 3853727 commit fb39d4f

32 files changed

+594
-528
lines changed
Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,8 @@
11
package ua.acclorite.book_story.data.parser
22

3-
import ua.acclorite.book_story.domain.reader.ChapterWithText
4-
import ua.acclorite.book_story.domain.util.Resource
3+
import ua.acclorite.book_story.domain.reader.ReaderText
54
import java.io.File
65

76
interface TextParser {
8-
suspend fun parse(file: File): Resource<List<ChapterWithText>>
7+
suspend fun parse(file: File): List<ReaderText>
98
}

‎app/src/main/java/ua/acclorite/book_story/data/parser/TextParserImpl.kt

Lines changed: 5 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,12 @@
11
package ua.acclorite.book_story.data.parser
22

33
import android.util.Log
4-
import ua.acclorite.book_story.R
54
import ua.acclorite.book_story.data.parser.epub.EpubTextParser
65
import ua.acclorite.book_story.data.parser.fb2.Fb2TextParser
76
import ua.acclorite.book_story.data.parser.html.HtmlTextParser
87
import ua.acclorite.book_story.data.parser.pdf.PdfTextParser
98
import ua.acclorite.book_story.data.parser.txt.TxtTextParser
10-
import ua.acclorite.book_story.domain.reader.ChapterWithText
11-
import ua.acclorite.book_story.domain.ui.UIText
12-
import ua.acclorite.book_story.domain.util.Resource
9+
import ua.acclorite.book_story.domain.reader.ReaderText
1310
import java.io.File
1411
import javax.inject.Inject
1512

@@ -20,14 +17,12 @@ class TextParserImpl @Inject constructor(
2017
private val pdfTextParser: PdfTextParser,
2118
private val epubTextParser: EpubTextParser,
2219
private val fb2TextParser: Fb2TextParser,
23-
private val htmlTextParser: HtmlTextParser,
20+
private val htmlTextParser: HtmlTextParser
2421
) : TextParser {
25-
override suspend fun parse(file: File): Resource<List<ChapterWithText>> {
22+
override suspend fun parse(file: File): List<ReaderText> {
2623
if (!file.exists()) {
2724
Log.e(TEXT_PARSER, "File does not exist.")
28-
return Resource.Error(
29-
UIText.StringResource(R.string.error_something_went_wrong_with_file)
30-
)
25+
return emptyList()
3126
}
3227

3328
val fileFormat = ".${file.extension}".lowercase().trim()
@@ -66,7 +61,7 @@ class TextParserImpl @Inject constructor(
6661

6762
else -> {
6863
Log.e(TEXT_PARSER, "Wrong file format, could not find supported extension.")
69-
Resource.Error(UIText.StringResource(R.string.error_wrong_file_format))
64+
emptyList()
7065
}
7166
}
7267
}

‎app/src/main/java/ua/acclorite/book_story/data/parser/epub/EpubTextParser.kt

Lines changed: 76 additions & 76 deletions
Original file line number | Diff line number | Diff line change
@@ -12,14 +12,12 @@ import kotlinx.coroutines.coroutineScope
1212
import kotlinx.coroutines.withContext
1313
import kotlinx.coroutines.yield
1414
import org.jsoup.Jsoup
15-
import ua.acclorite.book_story.R
1615
import ua.acclorite.book_story.data.parser.DocumentParser
16+
import ua.acclorite.book_story.data.parser.MarkdownParser
1717
import ua.acclorite.book_story.data.parser.TextParser
18-
import ua.acclorite.book_story.domain.reader.Chapter
19-
import ua.acclorite.book_story.domain.reader.ChapterWithText
20-
import ua.acclorite.book_story.domain.ui.UIText
21-
import ua.acclorite.book_story.domain.util.Resource
18+
import ua.acclorite.book_story.domain.reader.ReaderText
2219
import ua.acclorite.book_story.presentation.core.util.addAll
20+
import ua.acclorite.book_story.presentation.core.util.clearAllMarkdown
2321
import ua.acclorite.book_story.presentation.core.util.clearMarkdown
2422
import java.io.File
2523
import java.util.concurrent.ConcurrentLinkedQueue
@@ -34,16 +32,16 @@ private typealias Title = String
3432
private val dispatcher = Dispatchers.IO.limitedParallelism(2)
3533

3634
class EpubTextParser @Inject constructor(
35+
private val markdownParser: MarkdownParser,
3736
private val documentParser: DocumentParser
3837
) : TextParser {
3938

40-
override suspend fun parse(file: File): Resource<List<ChapterWithText>> {
39+
override suspend fun parse(file: File): List<ReaderText> {
4140
Log.i(EPUB_TAG, "Started EPUB parsing: ${file.name}.")
4241

4342
return try {
44-
val chapters = mutableListOf<ChapterWithText>()
45-
4643
yield()
44+
var readerText = listOf<ReaderText>()
4745

4846
withContext(Dispatchers.IO) {
4947
ZipFile(file).use { zip ->
@@ -62,35 +60,28 @@ class EpubTextParser @Inject constructor(
6260
Log.i(EPUB_TAG, "Chapter entries, size: ${chapterEntries.size}")
6361
Log.i(EPUB_TAG, "Title entries, size: ${chapterTitleEntries?.size}")
6462

65-
zip.parseEpub(
63+
readerText = zip.parseEpub(
6664
chapterEntries = chapterEntries,
6765
chapterTitleEntries = chapterTitleEntries
68-
).let {
69-
if (it == null || it.isEmpty()) {
70-
Log.e(EPUB_TAG, "Could not parse EPUB (null or empty).")
71-
return@withContext
72-
}
73-
chapters.addAll(it)
74-
}
66+
)
7567
}
7668
}
7769

7870
yield()
7971

80-
if (chapters.isEmpty()) {
81-
return Resource.Error(UIText.StringResource(R.string.error_file_empty))
72+
if (
73+
readerText.filterIsInstance<ReaderText.Text>().isEmpty() ||
74+
readerText.filterIsInstance<ReaderText.Chapter>().isEmpty()
75+
) {
76+
Log.e(EPUB_TAG, "Could not extract text from EPUB.")
77+
return emptyList()
8278
}
8379

8480
Log.i(EPUB_TAG, "Successfully finished EPUB parsing.")
85-
Resource.Success(chapters)
81+
readerText
8682
} catch (e: Exception) {
8783
e.printStackTrace()
88-
Resource.Error(
89-
UIText.StringResource(
90-
R.string.error_query,
91-
e.message?.take(40)?.trim() ?: ""
92-
)
93-
)
84+
emptyList()
9485
}
9586
}
9687

@@ -107,18 +98,18 @@ class EpubTextParser @Inject constructor(
10798
private suspend fun ZipFile.parseEpub(
10899
chapterEntries: List<ZipEntry>,
109100
chapterTitleEntries: Map<Title, List<String>>?
110-
): List<ChapterWithText>? {
101+
): List<ReaderText> {
111102

112-
val chapters = mutableListOf<ChapterWithText>()
103+
val readerText = mutableListOf<ReaderText>()
113104
coroutineScope {
114-
val unformattedChapters = ConcurrentLinkedQueue<ChapterWithText>()
105+
val unformattedText = ConcurrentLinkedQueue<Pair<Int, List<ReaderText>>>()
115106

116107
// Asynchronously getting all chapters with text
117108
val jobs = chapterEntries.mapIndexed { index, entry ->
118109
async(dispatcher) {
119110
yield()
120111

121-
unformattedChapters.parseZipEntry(
112+
unformattedText.parseZipEntry(
122113
zip = this@parseEpub,
123114
index = index,
124115
entry = entry,
@@ -131,27 +122,15 @@ class EpubTextParser @Inject constructor(
131122
jobs.awaitAll()
132123

133124
// Sorting chapters in correct order
134-
chapters.addAll {
135-
var textIndex = -1
136-
unformattedChapters.toList()
137-
.sortedBy { it.chapter.index }
138-
.mapIndexed { index, item ->
139-
item.copy(
140-
chapter = item.chapter.copy(
141-
index = index,
142-
startIndex = textIndex + 1,
143-
endIndex = textIndex + item.text.size
144-
)
145-
).also { textIndex += item.text.size }
146-
}
125+
readerText.addAll {
126+
unformattedText.toList()
127+
.sortedBy { (index, _) -> index }
128+
.map { it.second }
129+
.flatten()
147130
}
148131
}
149132

150-
if (chapters.isEmpty()) {
151-
return null
152-
}
153-
154-
return chapters
133+
return readerText
155134
}
156135

157136
/**
@@ -163,54 +142,75 @@ class EpubTextParser @Inject constructor(
163142
* @param entry [ZipEntry].
164143
* @param chapterTitleMap Titles from [getChapterTitleMapFromToc].
165144
*/
166-
private suspend fun ConcurrentLinkedQueue<ChapterWithText>.parseZipEntry(
145+
private suspend fun ConcurrentLinkedQueue<Pair<Int, List<ReaderText>>>.parseZipEntry(
167146
zip: ZipFile,
168147
index: Int,
169148
entry: ZipEntry,
170149
chapterTitleMap: Map<Title, List<String>>?
171150
) {
172151
// Getting all text
173152
val content = zip.getInputStream(entry).bufferedReader().use { it.readText() }
174-
var chapter = documentParser.run {
153+
var text = documentParser.run {
175154
Jsoup.parse(content).parseDocument()
176155
}
156+
val readerText = mutableListOf<ReaderText>()
177157

178-
if (chapter.isEmpty()) {
179-
Log.w(EPUB_TAG, "Chapter ${entry.name} is empty.")
180-
return
181-
}
182-
183-
// Getting title and removing first line (if matches title)
184-
val chapterTitle = getChapterTitleFromToc(
158+
// Adding chapter title from TOC if found
159+
var chapterAdded = false
160+
getChapterTitleFromToc(
185161
chapterSource = entry.name,
186162
chapterTitleMap = chapterTitleMap
187-
).run {
188-
if (this != null) {
189-
return@run this
163+
).apply {
164+
if (this == null) return@apply
165+
readerText.add(
166+
ReaderText.Chapter(
167+
title = this
168+
)
169+
)
170+
chapterAdded = true
171+
172+
text = text.dropWhile { line ->
173+
line.clearMarkdown().lowercase() == this.lowercase()
190174
}
191-
chapter.first().clearMarkdown()
192-
}.also { title ->
193-
chapter = chapter.dropWhile { line ->
194-
line.clearMarkdown().lowercase() == title.lowercase()
175+
}
176+
177+
// Format and add text
178+
text.forEach { line ->
179+
yield()
180+
181+
if (line.isNotBlank()) {
182+
when (line) {
183+
"***", "---" -> readerText.add(
184+
ReaderText.Separator
185+
)
186+
187+
else -> {
188+
if (!chapterAdded && line.clearAllMarkdown().isNotBlank()) {
189+
readerText.add(
190+
0, ReaderText.Chapter(
191+
title = line.clearAllMarkdown()
192+
)
193+
)
194+
chapterAdded = true
195+
} else readerText.add(
196+
ReaderText.Text(
197+
line = markdownParser.parse(line)
198+
)
199+
)
200+
}
201+
}
195202
}
196203
}
197204

198-
if (chapter.isEmpty()) {
199-
Log.w(EPUB_TAG, "Chapter ${entry.name} is empty.")
205+
if (
206+
readerText.filterIsInstance<ReaderText.Text>().isEmpty() ||
207+
readerText.filterIsInstance<ReaderText.Chapter>().isEmpty()
208+
) {
209+
Log.w(EPUB_TAG, "Could not extract text from [${entry.name}].")
200210
return
201211
}
202212

203-
add(
204-
ChapterWithText(
205-
Chapter(
206-
index = index,
207-
title = chapterTitle,
208-
startIndex = 0,
209-
endIndex = 0
210-
),
211-
text = chapter
212-
)
213-
)
213+
add(index to readerText)
214214
}
215215

216216
/**

0 commit comments

Comments
 (0)