Skip to content

Commit 604aa1b

Browse files
JoshRosen and cloud-fan
authored and committed
[SPARK-27786][SQL] Fix Sha1, Md5, and Base64 codegen when commons-codec is shaded
## What changes were proposed in this pull request? When running a custom build of Spark which shades `commons-codec`, the `Sha1` expression generates code which fails to compile: ``` org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 47, Column 93: A method named "sha1Hex" is not declared in any enclosing class nor any supertype, nor through a static import ``` This is caused by an interaction between Spark's code generator and the shading: the current codegen template includes the string `org.apache.commons.codec.digest.DigestUtils.sha1Hex` as part of a larger string literal, preventing JarJarLinks from being able to replace the class name with the shaded class's name. As a result, the generated code still references the original unshaded class name, triggering an error in case the original unshaded dependency isn't on the classpath. This problem impacts the `Sha1`, `Md5`, and `Base64` expressions. To fix this problem and allow for proper shading, this PR updates the codegen templates to replace the hardcoded class names with `${classOf[<name>].getName}` calls. ## How was this patch tested? Existing tests. To ensure that I found all occurrences of this problem, I used IntelliJ's "Find in Path" to search for lines matching the regex `^(?!import|package).*(org|com|net|io)\.(?!apache\.spark)` and then filtered matches to inspect only non-test "Usage in string constants" cases. This isn't _perfect_ but I think it'll catch most cases. Closes apache#24655 from JoshRosen/fix-shaded-apache-commons. Authored-by: Josh Rosen <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent 5f4b505 commit 604aa1b

File tree

2 files changed

+9
-9
lines changed

2 files changed

+9
-9
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ case class Md5(child: Expression) extends UnaryExpression with ImplicitCastInput
6363

6464
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
6565
defineCodeGen(ctx, ev, c =>
66-
s"UTF8String.fromString(org.apache.commons.codec.digest.DigestUtils.md5Hex($c))")
66+
s"UTF8String.fromString(${classOf[DigestUtils].getName}.md5Hex($c))")
6767
}
6868
}
6969

@@ -120,7 +120,7 @@ case class Sha2(left: Expression, right: Expression)
120120
}
121121

122122
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
123-
val digestUtils = "org.apache.commons.codec.digest.DigestUtils"
123+
val digestUtils = classOf[DigestUtils].getName
124124
nullSafeCodeGen(ctx, ev, (eval1, eval2) => {
125125
s"""
126126
if ($eval2 == 224) {
@@ -170,7 +170,7 @@ case class Sha1(child: Expression) extends UnaryExpression with ImplicitCastInpu
170170

171171
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
172172
defineCodeGen(ctx, ev, c =>
173-
s"UTF8String.fromString(org.apache.commons.codec.digest.DigestUtils.sha1Hex($c))"
173+
s"UTF8String.fromString(${classOf[DigestUtils].getName}.sha1Hex($c))"
174174
)
175175
}
176176
}

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ import java.util.regex.Pattern
2424

2525
import scala.collection.mutable.ArrayBuffer
2626

27+
import org.apache.commons.codec.binary.{Base64 => CommonsBase64}
28+
2729
import org.apache.spark.sql.catalyst.InternalRow
2830
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
2931
import org.apache.spark.sql.catalyst.expressions.codegen._
@@ -1829,15 +1831,13 @@ case class Base64(child: Expression) extends UnaryExpression with ImplicitCastIn
18291831
override def inputTypes: Seq[DataType] = Seq(BinaryType)
18301832

18311833
protected override def nullSafeEval(bytes: Any): Any = {
1832-
UTF8String.fromBytes(
1833-
org.apache.commons.codec.binary.Base64.encodeBase64(
1834-
bytes.asInstanceOf[Array[Byte]]))
1834+
UTF8String.fromBytes(CommonsBase64.encodeBase64(bytes.asInstanceOf[Array[Byte]]))
18351835
}
18361836

18371837
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
18381838
nullSafeCodeGen(ctx, ev, (child) => {
18391839
s"""${ev.value} = UTF8String.fromBytes(
1840-
org.apache.commons.codec.binary.Base64.encodeBase64($child));
1840+
${classOf[CommonsBase64].getName}.encodeBase64($child));
18411841
"""})
18421842
}
18431843
}
@@ -1859,12 +1859,12 @@ case class UnBase64(child: Expression) extends UnaryExpression with ImplicitCast
18591859
override def inputTypes: Seq[DataType] = Seq(StringType)
18601860

18611861
protected override def nullSafeEval(string: Any): Any =
1862-
org.apache.commons.codec.binary.Base64.decodeBase64(string.asInstanceOf[UTF8String].toString)
1862+
CommonsBase64.decodeBase64(string.asInstanceOf[UTF8String].toString)
18631863

18641864
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
18651865
nullSafeCodeGen(ctx, ev, (child) => {
18661866
s"""
1867-
${ev.value} = org.apache.commons.codec.binary.Base64.decodeBase64($child.toString());
1867+
${ev.value} = ${classOf[CommonsBase64].getName}.decodeBase64($child.toString());
18681868
"""})
18691869
}
18701870
}

0 commit comments

Comments
 (0)