R example: log tokenizer
The LogTokenizer transform function reads a varchar column of log messages from a table. It then tokenizes each log message and returns one row per token.
You can find more UDx examples in the Vertica Github repository, https://github.com/vertica/UDx-Examples.
Load the function and library
Create the library and the function.
=> CREATE OR REPLACE LIBRARY rLib AS 'log_tokenizer.R' LANGUAGE 'R';
CREATE LIBRARY
=> CREATE OR REPLACE TRANSFORM FUNCTION LogTokenizer AS LANGUAGE 'R' NAME 'LogTokenizerFactory' LIBRARY rLib FENCED;
CREATE FUNCTION
Querying data with the function
The following query shows how to call the UDTF, splitting each log message on spaces.
=> SELECT machine,
LogTokenizer(error_log USING PARAMETERS spliton = ' ') OVER(PARTITION BY machine)
FROM error_logs;
machine | Token
---------+---------
node001 | ERROR
node001 | 345
node001 | -
node001 | Broken
node001 | pipe
node001 | WARN
node001 | -
node001 | Nearly
node001 | filled
node001 | disk
node002 | ERROR
node002 | 111
node002 | -
node002 | Flooded
node002 | roads
node003 | ERROR
node003 | 222
node003 | -
node003 | Plain
node003 | old
node003 | broken
(21 rows)
UDTF R code
# Transform function body: split every log message in the first input column
# into tokens and return them as a one-column data frame, one token per row.
#
# Args:
#   input.data.frame: data frame whose first column holds the log messages.
#   parameters.data.frame: data frame of the USING PARAMETERS values; it must
#     contain a 'spliton' column giving the delimiter. strsplit() interprets
#     the delimiter as a regular expression.
#
# Returns:
#   A data frame with a single character column named 'tokens'. With no input
#   rows (or no tokens) the column is an empty character vector.
#
# Stops with an error when the 'spliton' parameter is missing.
LogTokenizer <- function(input.data.frame, parameters.data.frame) {
  # Take the spliton parameter passed by the user and assign it to a variable
  # in the function so we can use that as our tokenizer.
  if (is.null(parameters.data.frame[["spliton"]])) {
    stop("NULL value for spliton! Token cannot be NULL.")
  }
  split.on <- as.character(parameters.data.frame[["spliton"]])
  # Every message is split on the same (first) delimiter. Without this guard
  # strsplit() would recycle several delimiters across the messages.
  if (length(split.on) > 1) {
    split.on <- split.on[1]
  }

  # Tokenize all messages in one vectorized call instead of growing a vector
  # inside nested loops. as.character() also covers a factor-typed column.
  messages <- as.character(input.data.frame[, 1])
  tokens <- unlist(strsplit(messages, split.on), use.names = FALSE)

  # unlist() yields NULL when there is nothing to split; as.character() turns
  # that into character(0) so the output column keeps a stable type.
  # stringsAsFactors = FALSE keeps the column character on every R version.
  final.output <- data.frame(tokens = as.character(tokens),
                             stringsAsFactors = FALSE)
  return(final.output)
}
# Factory function named in CREATE TRANSFORM FUNCTION. It returns the list
# that describes the UDx: the function to run, its UDx type, the input and
# output column types, and the callbacks that supply the output-column and
# parameter definitions.
LogTokenizerFactory <- function() {
  factory.spec <- list(
    name = LogTokenizer,
    udxtype = "transform",
    intype = "varchar",
    outtype = "varchar",
    outtypecallback = LogTokenizerReturn,
    parametertypecallback = LogTokenizerParameters
  )
  factory.spec
}
# Parameter-type callback: describes the single 'spliton' parameter accepted
# by LogTokenizer. Returns a named list with the entries datatype, length,
# scale and name, in that order.
LogTokenizerParameters <- function() {
  spec <- list()
  spec$datatype <- "varchar"
  # length and scale are given as the literal string "NA".
  spec$length <- "NA"
  spec$scale <- "NA"
  spec$name <- "spliton"
  spec
}
# Output-type callback: describes the single output column of LogTokenizer.
# Both arguments are accepted but not used. Returns a one-row data frame with
# the columns datatype, length, scale and name; the column is a varchar named
# "Token", with length and scale left as NA.
LogTokenizerReturn <- function(arg.data.frame, parm.data.frame) {
  column.spec <- data.frame(
    datatype = "varchar",
    length = NA,
    scale = NA,
    name = "Token",
    stringsAsFactors = FALSE
  )
  column.spec
}