Run Scala code, output ipynb
Status: PoC
Scalanb is not published yet.
```scala
// In build.sbt

// To use batch notebooks, you need the macro paradise plugin and additional compiler options.
addCompilerPlugin("org.scalamacros" % "paradise" % "2.1.0" cross CrossVersion.full)
scalacOptions += "-Yrangepos"
```
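Since scalanb is not published to any public repository yet, one way to try it is to publish it locally from a checkout and depend on the local artifact. A minimal sketch; the organization and version below are assumptions, so check scalanb's own build definition for the real coordinates:

```
$ sbt publishLocal   # run inside a scalanb checkout
```

```scala
// In build.sbt; these coordinates are assumptions, not confirmed by this README
libraryDependencies += "com.todesking" %% "scalanb" % "0.1.0-SNAPSHOT"
```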
Write a notebook class with the `@Notebook` annotation (a `main` method is generated automatically). The resulting `.ipynb` is saved in `~/.scalanb/hist` by default.
```scala
import com.todesking.{scalanb => nb}

@nb.Notebook
class MyNotebook {
  nb.markdown("# Example of scalanb")
  // add more code here
}
```
and run:

```
$ sbt 'runMain MyNotebook'
```
See `Example1.scala` and its output.
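Judging from the log format shown later in this README (statements are echoed and expression values recorded as `=> ...`), each top-level statement in the class body appears to become a cell, and the value of a bare expression becomes that cell's output. A slightly fuller sketch (the class and values here are hypothetical):

```scala
import com.todesking.{scalanb => nb}

@nb.Notebook
class MyNotebook2 {
  nb.markdown("# Example of scalanb")
  val xs = (1 to 10).toList
  xs.sum // a bare expression; its value should appear as the cell's output
}
```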
To specify the history location, use the `--out` option:

```
$ sbt 'runMain MyNotebook --out=file:path=./hist/'
```
For Spark, use the `spark.Notebook` annotation (requirement: scalanb-spark):
```scala
import com.todesking.{scalanb => nb}

@nb.spark.Notebook
class MyNotebook {
  // A SparkSession named `spark` is available here
  val df = spark.read.csv("...")

  // Show DataFrames as HTML tables via the `nb.show` method
  df.nb.show(10)
}
```
```
$ sbt assembly  # Make a fat jar
$ spark-submit --class MyNotebook myapp.jar
```
To save the history to HDFS:

```
$ sbt 'runMain MyNotebook --out=hdfs:path=/tmp/hist/'
```
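With `spark-submit`, application arguments go after the jar, so the same option should be passable like this (assuming the generated `main` parses its arguments the same way when launched by Spark):

```
$ spark-submit --class MyNotebook myapp.jar --out=hdfs:path=/tmp/hist/
```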
When the `--log` option is enabled, a realtime log is available:

```
$ sbt 'runMain MyNotebook --log'
```
```
# .scalanb/hist/{TIME}_{NOTE_NAME}.log
[2018-08-21 21:46:48] > nb.setShowTimeMillis(100)
[2018-08-21 21:46:48] > nb.markdown("# Scalanb Example")
[2018-08-21 21:46:48] > val a = 1
[2018-08-21 21:46:48] > val b = 2
[2018-08-21 21:46:48] > a
[2018-08-21 21:46:48] => 1
[2018-08-21 21:46:48] > println(s"a = $a")
[2018-08-21 21:46:48] stdout: a = 1
```
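The log is a plain text file, so a running batch can be followed with standard tools; for example (the file name is hypothetical, following the `{TIME}_{NOTE_NAME}.log` pattern above):

```
$ tail -f ~/.scalanb/hist/20180821214648_MyNotebook.log
```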
For heavy computations, scalanb provides checkpoints via `nb.checkpoint`, which cache intermediate results across runs:

```scala
import com.todesking.{scalanb => nb}

@nb.Notebook
class BigData {
  val cp = nb.checkpoint

  // Not cached: loaded on every run (`loadData` is defined elsewhere)
  val rawLog = cp.nocache { loadData("data/raw.csv") }

  // Cached: recomputed only on a cache miss
  val count = cp.cache(rawLog) { rawLog => rawLog.count() }

  cp.unwrap(count) { count =>
    println(s"count = $count")
  }

  val userId = 10
  val theUsersLogs = cp.cache((rawLog, userId)) { case (rawLog, userId) =>
    rawLog.where('user_id === userId)
  }

  cp.unwrap(theUsersLogs) { theUsersLogs =>
    theUsersLogs.count()
    theUsersLogs.show()
  }
}
```
Caching is keyed by each value's ID, which is calculated from the expression's source code and the IDs of its dependencies:
```scala
// ID: rawLog-{ loadData("data/raw.csv") }
val rawLog = cp.nocache { loadData("data/raw.csv") }

// ID: count-{ rawLog => rawLog.count() }(rawLog-{ loadData("data/raw.csv") })
val count = cp.cache(rawLog) { rawLog => rawLog.count() }

// Primitive values can be used as dependencies.
// ID: lit:10
val userId = 10

// ID: theUsersLogs-{ case (rawLog, userId) => rawLog.where('user_id === userId) }((rawLog-{ loadData("data/raw.csv") }, lit:10))
val theUsersLogs = cp.cache((rawLog, userId)) { case (rawLog, userId) =>
  rawLog.where('user_id === userId)
}
```
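Because literals participate in the ID, editing a constant invalidates exactly the caches that depend on it. A sketch of what should happen on the next run after changing `userId` (inferred from the ID scheme above, not verified against the implementation):

```scala
// ID: lit:11 -- a literal's ID changes with its value
val userId = 11

// The dependency tuple now contains lit:11, so the ID no longer matches the
// cached entry: the filter is recomputed and cached under the new ID.
val theUsersLogs = cp.cache((rawLog, userId)) { case (rawLog, userId) =>
  rawLog.where('user_id === userId)
}

// `count` does not depend on userId, its ID is unchanged, and the cached
// result is reused as-is.
```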
The cache location can be specified with the `--cache` option. The default is `~/.scalanb/cache/`.
```
--cache=file:path=/path/to/cache
--cache=hdfs:path=/path/to/cache  # requires scalanb-spark
```
Cache entries are laid out as `{root}/{namespace}/{name}`, containing:

- `cache.json`: metadata (TODO)
- `data`: serialized data (the format is type specific)
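Since entries are plain files under the cache root, stale results can be inspected or dropped with ordinary file tools to force recomputation (the paths below are hypothetical examples under the default root; the exact directory names depend on how scalanb encodes namespaces and IDs):

```
$ ls ~/.scalanb/cache/
$ rm -r ~/.scalanb/cache/BigData   # hypothetical: drop all cached entries for one notebook
```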
To integrate EvilPlot, use this snippet:

```scala
import com.cibo.evilplot.plot
import plot.aesthetics.DefaultTheme._

// Tell scalanb how to render a Plot: rasterize it to PNG and wrap it as a binary value.
implicit val plotFormat = nb.Format[plot.Plot] { plot =>
  val img = plot.render().asBufferedImage
  val buf = new java.io.ByteArrayOutputStream()
  javax.imageio.ImageIO.write(img, "png", buf)
  buf.close()
  nb.Value.binary("image/png", buf.toByteArray)
}
```
And you can embed plots in the notebook:
```scala
import com.cibo.evilplot.numeric.Point

val data = (0.0 until 1.0 by 0.02).map { v =>
  (v, v * scala.util.Random.nextDouble)
}.toSeq

plot.LinePlot(data.map { case (x, y) => Point(x, y) })
  .xAxis()
  .yAxis()
  .frame()
  .xLabel("x")
  .yLabel("y")
```
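With `plotFormat` in implicit scope, the value of this expression should be picked up by the format above and embedded in the generated `.ipynb` as an `image/png` output.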