{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}

-- | Data sets used for LightGBM training and prediction, together with
-- conversions between CSV files on disk and in-memory 'F.Frame's.
module LightGBM.DataSet
  ( -- * Data Handling
    DataSet (..)
  , HasHeader(..)
  , fromCSV
  , fromFrame
  , toCSV
  , toFrame
  ) where

import Control.Exception (ErrorCall(..), throwIO)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as BSL
import qualified Data.Csv as CSV
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vinyl.Functor as Vinyl (Identity)
import qualified Data.Vinyl.TypeLevel as Vinyl (RecAll)
import qualified Frames as F
import Frames.CSV
  ( ParserOptions(..)
  , ReadRec
  , defaultParser
  , readTable
  , readTableOpt
  , writeCSV
  )
import Frames.InCore (RecVec)
import System.Directory (copyFile)

-- N.B. Right now it's just a data file, but we can add better types
-- (e.g. some sort of dataframe) as other options as we move forward.

-- | A set of data to use for training or prediction.
--
-- Currently the only representation is a CSV file on disk.
data DataSet = CSVFile
  { dataPath :: FilePath
    -- ^ Location of the CSV file holding the data
  , hasHeader :: HasHeader
    -- ^ Whether the first row of the file is a header row
  } deriving (Eq, Show)

-- | Describes whether a CSV data file has a header row or not.
newtype HasHeader = HasHeader
  { getHeader :: Bool
  } deriving (Eq, Show)

-- | Load data from a file.
--
-- LightGBM can read data from CSV or TSV files (or from LibSVM
-- formatted files).
--
-- Note that the LightGBM data file format traditionally consists of
-- putting the output (aka the /labels/) in the first column, and the
-- inputs (aka the /features/) in the subsequent columns.  However,
-- you can instruct LightGBM to
--
--   * use some other column for the labels with the 'P.LabelColumn' parameter, and
--   * ignore some of the feature columns with the 'P.IgnoreColumns' parameter.
--
-- This function only records the path and the header flag; it does
-- not touch the file system.
fromCSV :: HasHeader -> FilePath -> DataSet
fromCSV header path = CSVFile {dataPath = path, hasHeader = header}

-- | Load data from a 'F.Frame' into a 'DataSet'.
--
-- The frame is written out as a CSV file at the given path, and the
-- resulting 'DataSet' is marked as having a header row.
--
-- Note that this function causes the creation of a file, and it is up
-- to the caller to control the lifetime of this file.  This function
-- is typically called in a 'Control.Exception.bracket' or a similar
-- facility.  For example:
--
-- > withSystemTempFile "inputFrame" $ \ inputFile inputHandle -> do
-- >   hClose inputHandle
-- >   dataset <- fromFrame inFrame inputFile
--
-- where 'inFrame' is the input 'F.Frame'.
fromFrame ::
     ( F.ColumnHeaders ts
     , F.AsVinyl ts
     , Foldable f
     , Vinyl.RecAll Vinyl.Identity (F.UnColumn ts) Show
     )
  => f (F.Record ts) -- ^ The rows to persist
  -> FilePath        -- ^ Path of the CSV file to create
  -> IO DataSet
fromFrame dframe fname =
  fromCSV (HasHeader True) fname <$ writeCSV fname dframe

-- | Write a 'DataSet' out to a CSV file.
--
-- The backing file of the 'DataSet' is copied to the output path.
toCSV ::
     FilePath -- ^ Output path
  -> DataSet  -- ^ The data to persist
  -> IO ()
toCSV outPath dataset = copyFile (dataPath dataset) outPath

-- | Convert a 'DataSet' to a 'F.Frame'.
--
-- If the 'DataSet' doesn't have headers, then 'F.Frame' headers are
-- generated with names 'column_i' where 'i' is the index of the
-- column in question (starting at 0).  The number of columns is taken
-- from the first row of the file; an empty file is treated as having
-- zero columns.  If the file cannot be decoded as CSV while counting
-- the columns, an 'ErrorCall' is thrown.
--
-- Note that this function is polymorphic in the row type - the caller
-- will have to define that explicitly or in context.  (See the
-- doctest below for a simplistic example.)
--
-- >>> :set -XTypeOperators
-- >>> :set -XDataKinds
-- >>> import Frames ((:->))
-- >>> import qualified Frames as F
-- >>> import System.IO (hPutStrLn, hClose)
-- >>> import System.IO.Temp as TMP
-- >>> :{
-- TMP.withSystemTempFile "toFrameTest" $ \ filepath handle -> do
--   hPutStrLn handle "results\n1\n2\n3\n4\n5"
--   hClose handle
--   let ds = fromCSV (HasHeader True) filepath
--   dsf <- toFrame ds :: IO (F.Frame (F.Record '["results" :-> Int]))
--   return $ length dsf
-- :}
-- 5
--
-- >>> :{
-- TMP.withSystemTempFile "toFrameTest" $ \ filepath handle -> do
--   hPutStrLn handle "1\n2\n3\n4"
--   hClose handle
--   let ds = fromCSV (HasHeader False) filepath
--   dsf <- toFrame ds :: IO (F.Frame (F.Record '["column_0" :-> Int]))
--   return $ length dsf
-- :}
-- 4
toFrame :: (RecVec rs, ReadRec rs) => DataSet -> IO (F.FrameRec rs)
toFrame CSVFile {..} =
  case hasHeader of
    HasHeader True -> F.inCoreAoS $ readTable dataPath
    HasHeader False -> do
      opts <- parseOpts
      F.inCoreAoS $ readTableOpt opts dataPath
  where
    -- Parser options that supply synthetic @column_i@ headers, one per
    -- column found in the first row of the file.
    parseOpts :: IO ParserOptions
    parseOpts = do
      colNum <- colCount dataPath
      return
        defaultParser
          { headerOverride =
              Just [T.pack ("column_" ++ show i) | i <- [0 .. (colNum - 1)]]
          }
    -- Number of fields in the first row of the CSV file.  Uses a total
    -- lookup so that an empty file gives 0 rather than crashing.
    colCount :: FilePath -> IO Int
    colCount csvfile = do
      csvdata <- BSL.readFile csvfile
      case CSV.decode CSV.NoHeader csvdata
             :: Either String (V.Vector (V.Vector ByteString)) of
        Left err ->
          -- Same exception type that 'error' would raise, but thrown
          -- as a proper IO exception.
          throwIO . ErrorCall $
            "Failed to get CSV column count for conversion to Frame: " ++ err
        Right rows -> return $ maybe 0 V.length (rows V.!? 0)