Class: SparkConnect::DataFrameReader

Inherits:

Object

Object
SparkConnect::DataFrameReader

show all

Defined in: lib/spark_connect/reader.rb

Overview

Loads data from external sources into a DataFrame. Returned by SparkSession#read. Mirrors PySpark's DataFrameReader.

Examples:

spark.read.format("csv").option("header", true).load("data.csv")
spark.read.json("events.json")
spark.read.table("my_table")

Constant Summary collapse

Proto =

SparkConnect::Proto

Instance Method Summary collapse

#csv(*paths) ⇒ DataFrame
CSV at paths.
#format(source) ⇒ self
Set the input format ("csv", "json", "parquet", "orc", ...).
#initialize(session) ⇒ DataFrameReader constructor
A new instance of DataFrameReader.
#jdbc(url, table, properties = {}) ⇒ DataFrame
Read from a JDBC source.
#json(*paths) ⇒ DataFrame
JSON at paths.
#load(*paths) ⇒ DataFrame
Load data from the given path(s) using the configured format.
#option(key, value) ⇒ self
Set a single read option.
#options(opts) ⇒ self
Set multiple read options.
#orc(*paths) ⇒ DataFrame
ORC at paths.
#parquet(*paths) ⇒ DataFrame
Parquet at paths.
#schema(schema) ⇒ self
Set the input schema (a Types::StructType or DDL string).
#table(name) ⇒ DataFrame
Read a registered table or view.
#text(*paths) ⇒ DataFrame
Text at paths (one value column per line).

Constructor Details

#initialize(session) ⇒ `DataFrameReader`

Returns a new instance of DataFrameReader.

Parameters:

session (SparkSession)

# File 'lib/spark_connect/reader.rb', line 15

# Creates a reader bound to +session+. The reader starts with no format and
# no schema selected, and with an empty option set.
#
# @param session [SparkSession] stored in @session for later use.
def initialize(session)
  @session, @format, @schema = session, nil, nil
  @options = {}
end

Instance Method Details

#csv(*paths) ⇒ `DataFrame`

Returns CSV at paths.

Returns:

(DataFrame) —
CSV at paths.



71
72

# File 'lib/spark_connect/reader.rb', line 71

# Shorthand for selecting the "csv" format and loading +paths+ in one call.
#
# @param paths [Array<String>] forwarded unchanged to #load.
# @return [DataFrame] CSV at +paths+.
def csv(*paths)
  format("csv").load(*paths)
end
# @return [DataFrame] JSON at `paths`.

#format(source) ⇒ `self`

Set the input format ("csv", "json", "parquet", "orc", ...).

Returns:

(self)

# File 'lib/spark_connect/reader.rb', line 24

# Selects the input format ("csv", "json", "parquet", "orc", ...).
# +source+ is stored as a String, so symbols are accepted too.
#
# @param source [#to_s] the format name.
# @return [self] the reader, so calls can be chained.
def format(source)
  tap { @format = source.to_s }
end

#jdbc(url, table, properties = {}) ⇒ `DataFrame`

Read from a JDBC source.

Parameters:

url (String) —
the JDBC URL.
table (String) —
the table name (or subquery).
properties (Hash) (defaults to: {}) —
connection properties (user, password, ...).

Returns:

(DataFrame)

# File 'lib/spark_connect/reader.rb', line 87

# Read from a JDBC source.
#
# The keys of +properties+ are stringified and merged with the "url" and
# "dbtable" options. The explicit +url+ and +table+ arguments always win over
# a "url" / "dbtable" entry in +properties+, so a stray connection property
# cannot silently redirect the read.
#
# @param url [String] the JDBC URL.
# @param table [String] the table name (or subquery).
# @param properties [Hash, nil] connection properties (user, password, ...);
#   nil is treated as an empty Hash.
# @return [DataFrame]
def jdbc(url, table, properties = {})
  extras = (properties || {}).transform_keys(&:to_s)
  # On a key clash the block keeps the receiver's value, i.e. the explicit argument.
  opts = { "url" => url, "dbtable" => table }.merge(extras) { |_key, explicit, _extra| explicit }
  format("jdbc").options(opts).load
end

#json(*paths) ⇒ `DataFrame`

Returns JSON at paths.

Returns:

(DataFrame) —
JSON at paths.



73
74

# File 'lib/spark_connect/reader.rb', line 73

# Shorthand for selecting the "json" format and loading +paths+ in one call.
#
# @param paths [Array<String>] forwarded unchanged to #load.
# @return [DataFrame] JSON at +paths+.
def json(*paths)
  format("json").load(*paths)
end
# @return [DataFrame] Parquet at `paths`.

#load(*paths) ⇒ `DataFrame`

Load data from the given path(s) using the configured format.

Parameters:

paths (Array<String>)

Returns:

(DataFrame)

# File 'lib/spark_connect/reader.rb', line 54

# Load data from the given path(s) using the configured format.
#
# Nested arrays of paths are flattened and every entry is converted to a
# String. The format and schema fields of the data-source message are only
# assigned when a value has been configured on this reader.
#
# @param paths [Array<String>] zero or more paths (arrays are accepted).
# @return [DataFrame] whatever read_relation produces for the data source.
def load(*paths)
  locations = paths.flatten.map(&:to_s)
  source = Proto::Read::DataSource.new(options: @options, paths: locations)
  source.format = @format if @format
  source.schema = @schema if @schema
  read_relation(data_source: source)
end

#option(key, value) ⇒ `self`

Set a single read option.

Returns:

(self)

# File 'lib/spark_connect/reader.rb', line 38

# Set a single read option. Both the key and the value are stored as Strings.
#
# @param key [#to_s] the option name.
# @param value [#to_s] the option value.
# @return [self] the reader, so calls can be chained.
def option(key, value)
  @options.store(key.to_s, value.to_s)
  self
end

#options(opts) ⇒ `self`

Set multiple read options.

Returns:

(self)

# File 'lib/spark_connect/reader.rb', line 45

# Set multiple read options at once. Every key and value is stored as a
# String; entries already present under the same key are overwritten.
#
# @param opts [Hash] option names mapped to option values.
# @return [self] the reader, so calls can be chained.
def options(opts)
  opts.each do |name, setting|
    @options.store(name.to_s, setting.to_s)
  end
  self
end

#orc(*paths) ⇒ `DataFrame`

Returns ORC at paths.

Returns:

(DataFrame) —
ORC at paths.



77
78

# File 'lib/spark_connect/reader.rb', line 77

# Shorthand for selecting the "orc" format and loading +paths+ in one call.
#
# @param paths [Array<String>] forwarded unchanged to #load.
# @return [DataFrame] ORC at +paths+.
def orc(*paths)
  format("orc").load(*paths)
end
# @return [DataFrame] text at `paths` (one `value` column per line).

#parquet(*paths) ⇒ `DataFrame`

Returns Parquet at paths.

Returns:

(DataFrame) —
Parquet at paths.



75
76

# File 'lib/spark_connect/reader.rb', line 75

# Shorthand for selecting the "parquet" format and loading +paths+ in one call.
#
# @param paths [Array<String>] forwarded unchanged to #load.
# @return [DataFrame] Parquet at +paths+.
def parquet(*paths)
  format("parquet").load(*paths)
end
# @return [DataFrame] ORC at `paths`.

#schema(schema) ⇒ `self`

Set the input schema (a Types::StructType or DDL string).

Returns:

(self)

# File 'lib/spark_connect/reader.rb', line 31

# Set the input schema (a Types::StructType or DDL string).
#
# A Types::StructType is stored as its #simple_string; any other value is
# stored as the result of #to_s.
#
# @param schema [Types::StructType, #to_s] the schema to apply when reading.
# @return [self] the reader, so calls can be chained.
def schema(schema)
  @schema =
    if schema.is_a?(Types::StructType)
      schema.simple_string
    else
      schema.to_s
    end
  self
end

#table(name) ⇒ `DataFrame`

Read a registered table or view.

Parameters:

name (String)

Returns:

(DataFrame)

# File 'lib/spark_connect/reader.rb', line 65

# Read a registered table or view.
#
# The name is converted to a String and sent as the unparsed identifier of a
# named-table message, together with the options configured on this reader.
#
# @param name [String] the table or view name.
# @return [DataFrame] whatever read_relation produces for the named table.
def table(name)
  identifier = name.to_s
  named = Proto::Read::NamedTable.new(unparsed_identifier: identifier, options: @options)
  read_relation(named_table: named)
end

#text(*paths) ⇒ `DataFrame`

Returns text at paths (one value column per line).

Returns:

(DataFrame) —
text at paths (one value column per line).

# File 'lib/spark_connect/reader.rb', line 79

# Shorthand for selecting the "text" format and loading +paths+ in one call.
# Takes any number of paths, matching the documented signature
# +#text(*paths)+ and the sibling csv/json/parquet/orc readers.
#
# @param paths [Array<String>] forwarded unchanged to #load.
# @return [DataFrame] text at +paths+ (one +value+ column per line).
def text(*paths) = format("text").load(*paths)

Class: SparkConnect::DataFrameReader

Overview

Examples:

Constant Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(session) ⇒ DataFrameReader

Instance Method Details

#csv(*paths) ⇒ DataFrame

#format(source) ⇒ self

#jdbc(url, table, properties = {}) ⇒ DataFrame

#json(*paths) ⇒ DataFrame

#load(*paths) ⇒ DataFrame

#option(key, value) ⇒ self

#options(opts) ⇒ self

#orc(*paths) ⇒ DataFrame

#parquet(*paths) ⇒ DataFrame

#schema(schema) ⇒ self

#table(name) ⇒ DataFrame

#text(*paths) ⇒ DataFrame

#initialize(session) ⇒ `DataFrameReader`

#csv(*paths) ⇒ `DataFrame`

#format(source) ⇒ `self`

#jdbc(url, table, properties = {}) ⇒ `DataFrame`

#json(*paths) ⇒ `DataFrame`

#load(*paths) ⇒ `DataFrame`

#option(key, value) ⇒ `self`

#options(opts) ⇒ `self`

#orc(*paths) ⇒ `DataFrame`

#parquet(*paths) ⇒ `DataFrame`

#schema(schema) ⇒ `self`

#table(name) ⇒ `DataFrame`

#text(*paths) ⇒ `DataFrame`