-
Dataframes in Haskell: the Frames library
2016-08-24
SourceThis article demonstrates the use of the
Frames
library. It has three parts, each one containing aghc
script. We’ll load the following module at the beginning of each script.-- Frames_init.hs {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE DataKinds #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE TypeOperators #-} {-# LANGUAGE OverloadedStrings #-} module Init where import qualified Control.Foldl as L import qualified Data.Foldable as F import Lens.Family (view, set, over) import Frames import Frames.CSV (defaultParser, readTableOpt) import Pipes -- define a Show instance for frames instance (Show a) => Show (Frame a) where show (Frame l f) = (show $ f 0) ++ (if l>1 then "\n" ++ (show $ f 1) else "") ++ (if l>2 then "\n..." else "") ++ "\nFrame with " ++ (show l) ++ if(l>1) then " rows." else " row." -- - #~ Will be used in part 1 ~# - -- -- define new types for the records declareColumn "Name" ''String declareColumn "Value" ''Float declareColumn "Indic" ''Int declareColumn "Status" ''Bool -- - #~ Will be used in part 2 ~# - -- tableTypes "Iris" "iris_small.csv" irisStream :: Producer Iris IO () irisStream = readTableOpt defaultParser "iris_small.csv" loadIris :: IO (Frame Iris) loadIris = inCoreAoS irisStream -- - #~ Will be used in part 3 ~# - -- declareColumn "Ratio" ''Double declareColumn "MeanWidth" ''Double
Note that we have defined a
Show
instance for the dataframes, because there is none.Records
The first script below demonstrates how to define records and a dataframe by hand. In order to define these records, we used
declareColumn
in theInit
module. It creates new types and lenses.> :load Frames_init.hs > > -- - #~ Records ~# - -- > > -- define records > :set -XDataKinds > :{ > let row1 :: Record '[Name, Value, Indic, Status] > row1 = "Joe" &: 10.1 &: 0 &: True &: Nil > :} > :{ > let row2 :: Record '[Name, Value, Indic, Status] > row2 = "Bill" &: 9.0 &: 1 &: True &: Nil > :} > :{ > let row3 :: Record '[Name, Value, Indic, Status] > row3 = "Kate" &: -3.0 &: 2 &: False &: Nil > :} > > -- take a look > row1 {Name :-> "Joe", Value :-> 10.1, Indic :-> 0, Status :-> True} > > -- declareColumn has defined new lenses name, value, indic and status > :info name name :: (Functor f0, RElem Name rs0 (Data.Vinyl.TypeLevel.RIndex Name rs0)) => (String -> f0 String) -> Record rs0 -> f0 (Record rs0) -- Defined at Frames_init.hs:30:1 > > -- let's use them > view status row1 True > -- it's the same as > rget status row1 True > -- setter: > set name "Jim" row1 {Name :-> "Jim", Value :-> 10.1, Indic :-> 0, Status :-> True} > rput name "Jim" row1 {Name :-> "Jim", Value :-> 10.1, Indic :-> 0, Status :-> True} > -- apply a function: > over value (+1) row1 {Name :-> "Joe", Value :-> 11.1, Indic :-> 0, Status :-> True} > > > -- - #~ Now let's make a frame ~# - -- > let fr = toFrame [row1, row2, row3] > :t fr fr :: Frame (Record '[Name, Value, Indic, Status]) > > -- have a look thanks to our Show instance > fr {Name :-> "Joe", Value :-> 10.1, Indic :-> 0, Status :-> True} {Name :-> "Bill", Value :-> 9.0, Indic :-> 1, Status :-> True} ... Frame with 3 rows. > > -- how many rows? > frameLength fr 3 > > -- take row 0 > frameRow fr 0 == row1 True > > -- we can convert the frame to a list > :t F.toList fr F.toList fr :: [Record '[Name, Value, Indic, Status]] > > -- look at the Name column > view name <$> fr "Joe" "Bill" ... Frame with 3 rows. > -- this is a frame > :t view name <$> fr view name <$> fr :: Frame String > -- the same as > fmap (rget name) fr "Joe" "Bill" ... Frame with 3 rows. > -- as well as > rget name <$> fr "Joe" "Bill" ... Frame with 3 rows. > > -- set every Status to False > rput status False <$> fr {Name :-> "Joe", Value :-> 10.1, Indic :-> 0, Status :-> False} {Name :-> "Bill", Value :-> 9.0, Indic :-> 1, Status :-> False} ... Frame with 3 rows. > > -- double every Value > over value (*2) <$> fr {Name :-> "Joe", Value :-> 20.2, Indic :-> 0, Status :-> True} {Name :-> "Bill", Value :-> 18.0, Indic :-> 1, Status :-> True} ... Frame with 3 rows. > > -- get the minimal Value > L.fold L.minimum (rget value <$> fr) Just (-3.0) > -- or > minimum $ F.toList (rget value <$> fr) -3.0
Note that our imports of the functions
view
andset
of theLens.Family
module were not necessary because we can userget
andrput
instead (imported from theFrames.RecLens
module).The type
Frame a
is a functor ofa
’s, wherea
is the type of the rows. This explains why we can usefmap (rget name) fr
in the above code, becauserget name
is a function acting on the rows.Let’s go back to the
Show
instance we defined:instance (Show a) => Show (Frame a) where show (Frame l f) = ...
Frame a
is a data type with two constructors: an integer - its length -, and a functionInt -> a
- the function returning the row at a given index. They are respectively denoted byl
andf
in ourShow
instance.CSV import
The
Frames
library allows to get a dataframe from a CSV file. The import (in theInit
module) generates new types of records for the rows, and lenses as before.The CSV file we use contains the following small subset of the well-know iris dataset.
Id Petal.Length Petal.Width Species 1 1.4 0.2 setosa 2 1.4 0.2 setosa 3 1.3 0.2 setosa 4 3.9 1.4 versicolor 5 3.5 1.0 versicolor 6 4.2 1.5 versicolor 7 5.4 2.3 virginica 8 5.1 1.8 virginica > :load Frames_init.hs > > -- - #~ tableTypes results ~# - -- > > -- new types: Id, PetalLength, PetalWidth, and Species > :info PetalLength type PetalLength = "Petal.Length" :-> Double -- Defined at Frames_init.hs:38:1 > > -- lenses: id, petalLength, petalWidth, and species > :t species species :: (Functor f0, RElem Species rs0 (Data.Vinyl.TypeLevel.RIndex Species rs0)) => (Text -> f0 Text) -> Record rs0 -> f0 (Record rs0) > > -- Iris: an alias for the Record type of the rows > :info Iris type Iris = Record '["Id" :-> Int, "Petal.Length" :-> Double, "Petal.Width" :-> Double, "Species" :-> Text] -- Defined at Frames_init.hs:38:1 > > > -- - #~ the iris frame ~# - -- > > iris <- loadIris > iris {Id :-> 1, Petal.Length :-> 1.4, Petal.Width :-> 0.2, Species :-> "setosa"} {Id :-> 2, Petal.Length :-> 1.4, Petal.Width :-> 0.2, Species :-> "setosa"} ... Frame with 8 rows. > :t iris iris :: Frame Iris
Thus,
iris
is a frame of typeFrame Iris
, whereIris
is a synonym of theRecord
type of the rows. We will manipulateiris
in the next section.Subset, mutate, filter, aggregate
Here we give some examples of common operations on dataframes. We already saw how to transform a column in the case when the transformation does not change the type: this is achieved with the help of
over
. In order to add a new column to a frame, its type must be defined before.This is why the types
Ratio
andMeanWidth
used in the script below were defined in theInit
module.> :load Frames_init.hs > iris <- loadIris > > > -- - #~ subset of rows ~# - -- > > toFrame $ map (frameRow iris) [3, 4, 5] {Id :-> 4, Petal.Length :-> 3.9, Petal.Width :-> 1.4, Species :-> "versicolor"} {Id :-> 5, Petal.Length :-> 3.5, Petal.Width :-> 1.0, Species :-> "versicolor"} ... Frame with 3 rows. > > -- - #~ subset of columns ~# - -- > -- select Id and Species of row 3 > :set -XQuasiQuotes > :set -XDataKinds > select [pr|Id,Species|] $ frameRow iris 3 {Id :-> 4, Species :-> "versicolor"} > -- select Id and Species of the frame > select [pr|Id,Species|] <$> iris {Id :-> 1, Species :-> "setosa"} {Id :-> 2, Species :-> "setosa"} ... Frame with 8 rows. > > > -- - #~ add a new column ~# - -- > > -- say we want the ratio PetalLength/PetalWidth in a new column > -- first define a function for one record > :set -XDataKinds > :{ > let ratio :: Iris -> Record '[Id, PetalLength, PetalWidth, Species, Ratio] > ratio x = rappend x ( (rget petalLength x)/(rget petalWidth x) &: Nil ) > :} > -- test the ratio function on a row > ratio $ frameRow iris 0 {Id :-> 1, Petal.Length :-> 1.4, Petal.Width :-> 0.2, Species :-> "setosa", Ratio :-> 6.999999999999999} > -- map it to the frame > fmap ratio iris {Id :-> 1, Petal.Length :-> 1.4, Petal.Width :-> 0.2, Species :-> "setosa", Ratio :-> 6.999999999999999} {Id :-> 2, Petal.Length :-> 1.4, Petal.Width :-> 0.2, Species :-> "setosa", Ratio :-> 6.999999999999999} ... Frame with 8 rows. > > > -- - #~ filtering ~# - -- > > -- Species is Text so we set OverloadedStrings > :set -XOverloadedStrings > -- function returning True if 'kind' matches the species of 'record' > :{ > let isThisSpecies :: Text -> Iris -> Bool > isThisSpecies kind record = (rget species record) == kind > :} > -- test our function on a row > isThisSpecies "virginica" $ frameRow iris 0 False > -- filter > filterFrame (isThisSpecies "virginica") iris {Id :-> 7, Petal.Length :-> 5.4, Petal.Width :-> 2.3, Species :-> "virginica"} {Id :-> 8, Petal.Length :-> 5.1, Petal.Width :-> 1.8, Species :-> "virginica"} Frame with 2 rows. > > > -- - #~ aggregation ~# - -- > > -- say we want the mean of PetalWidth for each species > -- first write a function aggregating for a given species > import Numeric.Statistics (mean) > :{ > let recordMeanWidthBySpecies :: Frame Iris -> Text -> Record '[Species, MeanWidth] > recordMeanWidthBySpecies frame kind = kind &: average &: Nil > where average = mean . F.toList > $ (rget petalWidth <$> filterFrame (isThisSpecies kind) frame) > :} > -- test on one species > recordMeanWidthBySpecies iris "versicolor" {Species :-> "versicolor", MeanWidth :-> 1.3} > -- now map this function to all species, and frame > import Data.List (nub) > let allspecies = nub . F.toList $ rget species <$> iris > toFrame $ map (recordMeanWidthBySpecies iris) allspecies {Species :-> "setosa", MeanWidth :-> 0.2} {Species :-> "versicolor", MeanWidth :-> 1.3} ... Frame with 3 rows. > > -- put things together > :{ > let meanWidthAggregationBySpecies :: Frame Iris -> Frame (Record '[Species, MeanWidth]) > meanWidthAggregationBySpecies frame = toFrame $ map (recordMeanWidthBySpecies frame) allspecies > where allspecies = nub . F.toList $ rget species <$> frame > recordMeanWidthBySpecies fr kind = kind &: average &: Nil > where average = mean . F.toList > $ (rget petalWidth <$> filterFrame (isThisSpecies kind) fr) > :} > meanWidthAggregationBySpecies iris {Species :-> "setosa", MeanWidth :-> 0.2} {Species :-> "versicolor", MeanWidth :-> 1.3} ... Frame with 3 rows.