176 lines
5.2 KiB
Lua
176 lines
5.2 KiB
Lua
--_DEBUG=true
|
|
data = require 'pl.data'
|
|
List = require 'pl.List'
|
|
array = require 'pl.array2d'
|
|
seq = require 'pl.seq'
|
|
utils = require 'pl.utils'
|
|
stringio = require 'pl.stringio'
|
|
open = stringio. open
|
|
asserteq = require 'pl.test' . asserteq
|
|
T = require 'pl.test'. tuple
|
|
|
|
-- tab-separated data, explicit column names
|
|
t1f = open [[
|
|
EventID Magnitude LocationX LocationY LocationZ LocationError EventDate DataFile
|
|
981124001 2.0 18988.4 10047.1 4149.7 33.8 24/11/1998 11:18:05 981124DF.AAB
|
|
981125001 0.8 19104.0 9970.4 5088.7 3.0 25/11/1998 05:44:54 981125DF.AAB
|
|
981127003 0.5 19012.5 9946.9 3831.2 46.0 27/11/1998 17:15:17 981127DF.AAD
|
|
981127005 0.6 18676.4 10606.2 3761.9 4.4 27/11/1998 17:46:36 981127DF.AAF
|
|
981127006 0.2 19109.9 9716.5 3612.0 11.8 27/11/1998 19:29:51 981127DF.AAG
|
|
]]
|
|
|
|
t1 = data.read (t1f)
|
|
-- column_by_name returns a List
|
|
asserteq(t1:column_by_name 'Magnitude',List{2,0.8,0.5,0.6,0.2})
|
|
-- can use array.column as well
|
|
asserteq(array.column(t1,2),{2,0.8,0.5,0.6,0.2})
|
|
|
|
-- only numerical columns (deduced from first data row) are converted by default
|
|
-- can look up indices in the list fieldnames.
|
|
EDI = t1.fieldnames:index 'EventDate'
|
|
assert(type(t1[1][EDI]) == 'string')
|
|
|
|
-- select method returns a sequence, in this case single-valued.
|
|
-- (Note that seq.copy returns a List)
|
|
asserteq(seq(t1:select 'LocationX where Magnitude > 0.5'):copy(),List{18988.4,19104,18676.4})
|
|
|
|
--[[
|
|
--a common select usage pattern:
|
|
for event,mag in t1:select 'EventID,Magnitude sort by Magnitude desc' do
|
|
print(event,mag)
|
|
end
|
|
--]]
|
|
|
|
-- space-separated, but with last field containing spaces.
|
|
t2f = open [[
|
|
USER PID %MEM %CPU COMMAND
|
|
sdonovan 2333 0.3 0.1 background --n=2
|
|
root 2332 0.4 0.2 fred --start=yes
|
|
root 2338 0.2 0.1 backyard-process
|
|
]]
|
|
|
|
t2,err = data.read(t2f,{last_field_collect=true})
|
|
if not t2 then return print (err) end
|
|
|
|
-- the last_field_collect option is useful with space-delimited data where the last
|
|
-- field may contain spaces. Otherwise, a record count mismatch should be an error!
|
|
lt2 = List(t2[2])
|
|
asserteq(lt2:join ',','root,2332,0.4,0.2,fred --start=yes')
|
|
|
|
-- fieldnames are converted into valid identifiers by substituting _
|
|
-- (we do this to make select queries parseable by Lua)
|
|
asserteq(t2.fieldnames,List{'USER','PID','_MEM','_CPU','COMMAND'})
|
|
|
|
-- select queries are NOT SQL so remember to use == ! (and no 'between' operator, sorry)
|
|
--s,err = t2:select('_MEM where USER="root"')
|
|
--assert(err == [[[string "tmp"]:9: unexpected symbol near '=']])
|
|
|
|
s = t2:select('_MEM where USER=="root"')
|
|
assert(s() == 0.4)
|
|
assert(s() == 0.2)
|
|
assert(s() == nil)
|
|
|
|
-- CSV, Excel style
|
|
t3f = open [[
|
|
Department Name,Employee ID,Project,Hours Booked
|
|
sales,1231,overhead,4
|
|
sales,1255,overhead,3
|
|
engineering,1501,development,5
|
|
engineering,1501,maintenance,3
|
|
engineering,1433,maintenance,10
|
|
]]
|
|
|
|
t3 = data.read(t3f)
|
|
|
|
-- a common operation is to select using a given list of columns, and each row
|
|
-- on some explicit condition. The select() method can take a table with these
|
|
-- parameters
|
|
keepcols = {'Employee_ID','Hours_Booked'}
|
|
|
|
q = t3:select { fields = keepcols,
|
|
where = function(row) return row[1]=='engineering' end
|
|
}
|
|
|
|
asserteq(seq.copy2(q),{{1501,5},{1501,3},{1433,10}})
|
|
|
|
-- another pattern is doing a select to restrict rows & columns, process some
|
|
-- fields and write out the modified rows.
|
|
|
|
utils.import 'pl.func'
|
|
|
|
outf = stringio.create()
|
|
|
|
names = {[1501]='don',[1433]='dilbert'}
|
|
|
|
t3:write_row (outf,{'Employee','Hours_Booked'})
|
|
q = t3:select_row {fields=keepcols,where=Eq(_1[1],'engineering')}
|
|
for row in q do
|
|
row[1] = names[row[1]]
|
|
t3:write_row(outf,row)
|
|
end
|
|
|
|
asserteq(outf:value(),
|
|
[[
|
|
Employee,Hours_Booked
|
|
don,5
|
|
don,3
|
|
dilbert,10
|
|
]])
|
|
|
|
-- data may not always have column headers. When creating a data object
|
|
-- from a two-dimensional array, must specify the fieldnames, as a list or a string.
|
|
-- The delimiter is deduced from the fieldname string, so a string just containing
|
|
-- the delimiter will set it, and the fieldnames will be empty.
|
|
local dat = List()
|
|
local row = List.range(1,10)
|
|
for i = 1,10 do
|
|
dat:append(row:map('*',i))
|
|
end
|
|
dat = data.new(dat,',')
|
|
local out = stringio.create()
|
|
dat:write(out,',')
|
|
asserteq(out:value(), [[
|
|
1,2,3,4,5,6,7,8,9,10
|
|
2,4,6,8,10,12,14,16,18,20
|
|
3,6,9,12,15,18,21,24,27,30
|
|
4,8,12,16,20,24,28,32,36,40
|
|
5,10,15,20,25,30,35,40,45,50
|
|
6,12,18,24,30,36,42,48,54,60
|
|
7,14,21,28,35,42,49,56,63,70
|
|
8,16,24,32,40,48,56,64,72,80
|
|
9,18,27,36,45,54,63,72,81,90
|
|
10,20,30,40,50,60,70,80,90,100
|
|
]])
|
|
|
|
-- you can always use numerical field indices, AWK-style;
|
|
-- note how the copy_select method gives you a data object instead of an
|
|
-- iterator over the fields
|
|
local res = dat:copy_select '$1,$3 where $1 > 5'
|
|
local L = List
|
|
asserteq(L(res),L{
|
|
L{6, 18},
|
|
L{7,21},
|
|
L{8,24},
|
|
L{9,27},
|
|
L{10,30},
|
|
})
|
|
|
|
-- the column_by_name method may take a fieldname or an index
|
|
asserteq(dat:column_by_name(2), L{2,4,6,8,10,12,14,16,18,20})
|
|
|
|
-- the field list may contain expressions or even constants
|
|
local q = dat:select '$3,2*$4 where $1 == 8'
|
|
asserteq(T(q()),T(24,64))
|
|
|
|
dat = data.read(open [[
|
|
1.0 0.1
|
|
0.2 1.3
|
|
]])
|
|
|
|
-- if a method cannot be found, then we look up in array2d
|
|
-- array2d.flatten(t) makes a 1D list out of a 2D array,
|
|
-- and then List.minmax() gets the extrema.
|
|
|
|
asserteq(T(dat:flatten():minmax()),T(0.1,1.3))
|
|
|