-
Notifications
You must be signed in to change notification settings - Fork 78
Compare DataFrames #1556
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Compare DataFrames #1556
Changes from 20 commits
9580248
eeee3f3
5fd2bfa
fccaaf6
bcc41e0
51e2b23
f4b21e3
ce46f72
b25c712
026ffe6
2de3ad1
352bf45
0db34d8
3387489
e052646
4db19bf
b4be510
e7b648f
483163d
1ebe865
c35ef21
f85a2f7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,221 @@ | ||
| package org.jetbrains.kotlinx.dataframe.impl.api | ||
|
|
||
| import org.jetbrains.kotlinx.dataframe.DataFrame | ||
| import org.jetbrains.kotlinx.dataframe.annotations.DataSchema | ||
| import org.jetbrains.kotlinx.dataframe.api.DataRowSchema | ||
| import org.jetbrains.kotlinx.dataframe.api.concat | ||
| import org.jetbrains.kotlinx.dataframe.api.dataFrameOf | ||
| import org.jetbrains.kotlinx.dataframe.api.emptyDataFrame | ||
| import org.jetbrains.kotlinx.dataframe.nrow | ||
|
|
||
/**
 * One entry of the report built by [compareDataFramesImpl]: a single row-level difference
 * (a removal or an insertion) between the two compared frames.
 *
 * @property rowAtIndex index of the differing row inside the frame named by [of].
 * @property of the frame the differing row belongs to.
 * @property wasRemoved [RowOfComparison.WAS_REMOVED] when this entry describes a removal, `null` otherwise.
 * @property insertedAfterRow [RowOfComparison.WAS_INSERTED_AFTER_ROW] when this entry describes an insertion,
 * `null` otherwise.
 * @property afterRow for an insertion, the index of the reference row the new row was inserted after;
 * `null` for a removal. NOTE(review): it is computed as `x - 1`, so it is presumably `-1` when the
 * insertion precedes the first row — confirm.
 */
@DataSchema
internal class ComparisonDescription(
    val rowAtIndex: Int,
    val of: DataFrameOfComparison,
    val wasRemoved: RowOfComparison?,
    val insertedAfterRow: RowOfComparison?,
    val afterRow: Int?,
) : DataRowSchema
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if we could include the modified
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Myers difference algorithm exploits the idea of comparison in a boolean sense,
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. IMO, explicitly representing modified rows means customizing the logic of the Myers algorithm by introducing a three-valued, non-boolean comparison: Equal, Not Equal, Similar. Similar means that the compared rows differ in a limited number of elements (a number that may be proportional to the row's length).
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not (yet) asking for in-row differences. Simply for adding the original row to a new column when that row was "Not Equal", so either "removed" or "inserted". A Does that make sense?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, it would make the comparison output more independent. I can try to implement it.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Done, i added a |
||
|
|
||
/**
 * Names which of the two compared frames a reported row belongs to.
 */
internal enum class DataFrameOfComparison {
    /** The first frame passed to the comparison (`dfA`). */
    DFA,

    /** The second frame passed to the comparison (`dfB`). */
    DFB,
}
|
|
||
/**
 * The kind of row-level difference described by one comparison entry.
 */
internal enum class RowOfComparison {
    /** The row exists only in the second frame and was inserted after a reference row. */
    WAS_INSERTED_AFTER_ROW,

    /** The row exists only in the first frame and was removed. */
    WAS_REMOVED,
}
|
|
||
/**
 * Builds a report of the row-level differences between [dfA] and [dfB].
 *
 * The path returned by [myersDifferenceAlgorithmImpl] is read as a sequence of `(x, y)` points,
 * and every pair of consecutive points is classified:
 * - `x` advanced by one while `y` did not: row `x - 1` of [dfA] was removed;
 * - `y` advanced by one while `x` did not: row `y - 1` of [dfB] was inserted after row `x - 1` of [dfA];
 * - any other step (a diagonal one) yields no entry.
 *
 * All indices refer to the original frames: the commands of the edit script must be thought of
 * as being executed simultaneously, not one after the other.
 *
 * @param dfA the frame taken as the starting point of the edit script.
 * @param dfB the frame the edit script leads to.
 * @return a frame with one [ComparisonDescription] row per removal or insertion, in path order.
 */
internal fun <T> compareDataFramesImpl(dfA: DataFrame<T>, dfB: DataFrame<T>): DataFrame<ComparisonDescription> {
    // The comparison relies on Myers' difference algorithm.
    val editPath = myersDifferenceAlgorithmImpl(dfA, dfB)
    var report = emptyDataFrame<ComparisonDescription>()
    for ((previous, current) in editPath.zipWithNext()) {
        val (xPrev, yPrev) = previous
        val (x, y) = current
        val xAdvancedByOne = xPrev + 1 == x
        val yAdvancedByOne = yPrev + 1 == y
        val entry = when {
            // Horizontal step: row 'x - 1' of dfA was removed.
            xAdvancedByOne && !yAdvancedByOne ->
                ComparisonDescription(
                    rowAtIndex = x - 1,
                    of = DataFrameOfComparison.DFA,
                    wasRemoved = RowOfComparison.WAS_REMOVED,
                    insertedAfterRow = null,
                    afterRow = null,
                )

            // Vertical step: row 'y - 1' of dfB was inserted after row 'x - 1' of dfA.
            yAdvancedByOne && !xAdvancedByOne ->
                ComparisonDescription(
                    rowAtIndex = y - 1,
                    of = DataFrameOfComparison.DFB,
                    wasRemoved = null,
                    insertedAfterRow = RowOfComparison.WAS_INSERTED_AFTER_ROW,
                    afterRow = x - 1,
                )

            // Diagonal step: the rows are equal, nothing to report.
            else -> null
        }
        entry?.let { report = report.concat(dataFrameOf(it)) }
    }
    return report
}
|
|
||
/**
 * Myers' difference algorithm for two frames with the same schema.
 *
 * Returns an optimal path from the origin to `(N, M)` in the edit graph, where `N` is `dfA.nrow`
 * and `M` is `dfB.nrow`. Knowing this path is knowing the differences between [dfA] and [dfB]
 * and the shortest edit script to get B from A.
 * The cost of the algorithm's worst case is `O((N + M) D)`, `D` being the length of the shortest edit script.
 *
 * The idea of the algorithm is the following: try to cross the edit graph making `d` non-diagonal moves,
 * and increase `d` until you succeed.
 * Non-diagonal moves make the edit script longer, while diagonal moves do not.
 *
 * Terminology:
 * - snake: a non-diagonal edge followed by a possibly empty sequence of diagonal edges;
 * - D-path: a path starting at `(0, 0)` that has exactly D non-diagonal edges.
 *
 * @param dfA the frame the edit script starts from.
 * @param dfB the frame the edit script leads to.
 * @return the points `(x, y)` of the path, ordered from the origin to `(N, M)`;
 * when both frames are empty the path is the single point `(0, 0)`.
 */
internal fun <T> myersDifferenceAlgorithmImpl(dfA: DataFrame<T>, dfB: DataFrame<T>): List<Pair<Int, Int>> {
    val rowsInA = dfA.nrow
    val rowsInB = dfB.nrow
    val sumOfLength = rowsInA + rowsInB
    // Two empty frames: the edit graph is the single point (0, 0). Handling it here also avoids
    // indexing the one-cell row of 'v' at position 1, which is out of bounds.
    if (sumOfLength == 0) return listOf(Pair(0, 0))

    val path = mutableListOf<Pair<Int, Int>>()
    // Position 0 of a row corresponds to diagonal -(M+N) in the algorithm's paper,
    // hence every access to 'v' is shifted by 'normalizer'.
    val normalizer = sumOfLength
    val rowWidth = sumOfLength * 2 + 1
    // v[d][k + normalizer] is the x of the endpoint of the furthest reaching d-path ending in diagonal k.
    // Rows are allocated on demand, one per value of 'd' actually explored.
    val v = mutableListOf(IntArray(rowWidth))
    v[0][1 + normalizer] = 0 // fictitious
    var isOver = false
    // d is the number of non-diagonal edges; when the loop ends successfully it is the length
    // of the shortest edit script (not returned, the path carries the same information).
    var d = 0
    while (d <= sumOfLength && !isOver) {
        // Row d + 1 receives a copy of this iteration's endpoints (see below).
        if (d < sumOfLength) {
            v.add(IntArray(rowWidth))
        }
        for (k in -d..d step 2) {
            // Each furthest reaching d-path ending in diagonal k is built from the furthest reaching
            // (d-1)-path ending in k-1 or (exclusive or) k+1.
            val startX =
                if (k == -d || (k != d && v[d][k - 1 + normalizer] < v[d][k + 1 + normalizer])) {
                    v[d][k + 1 + normalizer]
                } else {
                    v[d][k - 1 + normalizer] + 1
                }
            var x = startX
            var y = x - k
            // Follow the diagonal for as long as the rows are equal.
            while (x < rowsInA && y < rowsInB && dfA[x] == dfB[y]) {
                x += 1
                y += 1
            }
            v[d][k + normalizer] = x
            // The next iteration reads this endpoint from its own row.
            if (d < sumOfLength) {
                v[d + 1][k + normalizer] = x
            }
            // Edit graph was fully crossed: rebuild the path backwards and stop.
            if (x >= rowsInA && y >= rowsInB) {
                isOver = true
                tailrec(path, v, d, k, normalizer, dfA, dfB)
                break
            }
        }
        // Try with a longer edit script.
        d++
    }
    return path.toList()
}
|
|
||
/**
 * Rebuilds, backwards, the path found by [myersDifferenceAlgorithmImpl] and stores it in [path].
 *
 * Starting from the endpoint of the furthest reaching [d]-path on diagonal [k], each step
 * enlists that endpoint, enlists the diagonal points of the snake that leads to it, and moves to
 * the endpoint (on diagonal `k - 1` or `k + 1`, with one non-diagonal edge less) it was built from.
 * Once `d` reaches 0 the remaining diagonal down to the origin is enlisted and [path] is reversed,
 * so that it reads from the origin to the final endpoint.
 *
 * The walk is implemented as a loop rather than by recursion, so its stack usage does not grow
 * with the length of the edit script. Note that the function name coincides with Kotlin's
 * `tailrec` modifier; it is an ordinary function.
 *
 * @param path output list; points are appended to it and the whole list is reversed at the end.
 * @param v per-`d` rows of furthest reaching endpoints, as filled by [myersDifferenceAlgorithmImpl].
 * @param d number of non-diagonal edges of the path ending at the starting endpoint.
 * @param k diagonal of the starting endpoint.
 * @param normalizer offset added to a diagonal to obtain its index inside a row of [v].
 * @param dfA first compared frame.
 * @param dfB second compared frame.
 * @throws IllegalStateException if [v] is not consistent with [dfA] and [dfB], i.e. a snake cannot
 * be followed back to its starting point.
 */
internal fun <T> tailrec(
    path: MutableList<Pair<Int, Int>>,
    v: MutableList<IntArray>,
    d: Int,
    k: Int,
    normalizer: Int,
    dfA: DataFrame<T>,
    dfB: DataFrame<T>,
) {
    var depth = d
    var diagonal = k
    while (true) {
        // Enlist the current endpoint.
        val xCurrent = v[depth][diagonal + normalizer]
        val yCurrent = xCurrent - diagonal
        path.add(Pair(xCurrent, yCurrent))
        if (depth == 0) break

        // Find the endpoint the current one was built from, with the same rule used when 'v' was filled.
        val cameFromUpperDiagonal = diagonal == -depth ||
            (diagonal != depth && v[depth][diagonal - 1 + normalizer] < v[depth][diagonal + 1 + normalizer])
        val previousDiagonal = if (cameFromUpperDiagonal) diagonal + 1 else diagonal - 1
        val previousX = v[depth - 1][previousDiagonal + normalizer]
        // First point of the current diagonal, right after the non-diagonal edge.
        val snakeStartX = if (cameFromUpperDiagonal) previousX else previousX + 1

        check(snakeStartX <= xCurrent) {
            "Inconsistent edit graph: snake on diagonal $diagonal starts at x=$snakeStartX, past its endpoint x=$xCurrent"
        }
        // Every point of the snake before its endpoint must be the start of a diagonal edge (equal rows).
        for (x in snakeStartX until xCurrent) {
            val y = x - diagonal
            check(x >= 0 && y >= 0 && x < dfA.nrow && y < dfB.nrow && dfA[x] == dfB[y]) {
                "Inconsistent edit graph: no diagonal edge leaves point ($x, $y)"
            }
        }
        // Enlist the snake (possibly empty), from the point nearest to the endpoint back to its start.
        for (x in xCurrent - 1 downTo snakeStartX) {
            path.add(Pair(x, x - diagonal))
        }

        depth -= 1
        diagonal = previousDiagonal
    }

    // Base step: add the diagonal edges between the origin and the furthest reaching point with d = 0.
    val (lastX, lastY) = path.last()
    if (lastX != 0 && lastY != 0) {
        var x = lastX - 1
        var y = lastY - 1
        while (x >= 0 && y >= 0) {
            path.add(Pair(x, y))
            x -= 1
            y -= 1
        }
    }
    // The path was collected backwards: reverse it so that it reads from the origin onwards.
    path.reverse()
}
Uh oh!
There was an error while loading. Please reload this page.