R 示例:日志分词器
LogTokenizer 转换函数会从表中读取可变长字符串(varchar),即日志消息。然后,它会对每条日志消息进行分词,并逐个返回分词得到的标记(token)。
您可以在 Vertica Github 存储库中找到更多 UDx 示例:https://github.com/vertica/UDx-Examples。
加载函数和库
创建库和函数。
=> CREATE OR REPLACE LIBRARY rLib AS 'log_tokenizer.R' LANGUAGE 'R';
CREATE LIBRARY
=> CREATE OR REPLACE TRANSFORM FUNCTION LogTokenizer AS LANGUAGE 'R' NAME 'LogTokenizerFactory' LIBRARY rLib FENCED;
CREATE FUNCTION
使用函数查询数据
以下查询演示了如何调用该 UDTF。
=> SELECT machine,
LogTokenizer(error_log USING PARAMETERS spliton = ' ') OVER(PARTITION BY machine)
FROM error_logs;
machine | Token
---------+---------
node001 | ERROR
node001 | 345
node001 | -
node001 | Broken
node001 | pipe
node001 | WARN
node001 | -
node001 | Nearly
node001 | filled
node001 | disk
node002 | ERROR
node002 | 111
node002 | -
node002 | Flooded
node002 | roads
node003 | ERROR
node003 | 222
node003 | -
node003 | Plain
node003 | old
node003 | broken
(21 rows)
UDTF R 代码
# Split every log message in the first input column into tokens.
#
# Args:
#   input.data.frame: data frame whose first column holds the log messages.
#   parameters.data.frame: data frame (or list) carrying the user-supplied
#     `spliton` parameter, the delimiter handed to strsplit().
#
# Returns:
#   A one-column data frame named `tokens` with one row per token, in input
#   order. The column is always of type character, even when there is nothing
#   to tokenize.
#
# Raises:
#   An error when `spliton` is missing, empty, or NA.
LogTokenizer <- function(input.data.frame, parameters.data.frame) {
  # Take the spliton parameter passed by the user and validate it before use.
  # NOTE(review): a SQL NULL presumably reaches R as NA rather than NULL, so
  # both are rejected here -- confirm against the Vertica R SDK.
  split.param <- parameters.data.frame[["spliton"]]
  if (is.null(split.param) ||
      length(split.param) == 0L ||
      is.na(split.param[[1L]])) {
    stop("NULL value for spliton! Token cannot be NULL.")
  }

  # strsplit() recycles `split` along its input, so pin a single delimiter to
  # make sure every message is split the same way.
  split.on <- as.character(split.param[[1L]])

  # Tokenize all messages in one vectorized call instead of growing a vector
  # inside a loop. as.character() keeps factor columns working, and the outer
  # as.character() turns the NULL that unlist() yields for empty input into
  # character(0) so the output column keeps its type.
  messages <- as.character(input.data.frame[, 1])
  tokens <- as.character(
    unlist(strsplit(messages, split.on), use.names = FALSE)
  )

  final.output <- data.frame(tokens = tokens, stringsAsFactors = FALSE)
  return(final.output)
}
# Factory for the LogTokenizer UDx: returns the list that names the main
# function, declares it as a transform taking one varchar and producing one
# varchar, and points at the output-type and parameter-type callbacks.
LogTokenizerFactory <- function() {
  factory.spec <- list(
    name = LogTokenizer,
    udxtype = "transform",
    intype = "varchar",
    outtype = "varchar",
    outtypecallback = LogTokenizerReturn,
    parametertypecallback = LogTokenizerParameters
  )
  factory.spec
}
# Parameter-type callback: describes the single `spliton` parameter as a
# varchar. Length and scale are given as the literal string "NA".
LogTokenizerParameters <- function() {
  list(
    datatype = "varchar",
    length = "NA",
    scale = "NA",
    name = "spliton"
  )
}
# Output-type callback: describes the single output column as a varchar named
# "Token", leaving length and scale as NA. Both arguments are accepted but not
# used.
LogTokenizerReturn <- function(arg.data.frame, parm.data.frame) {
  data.frame(
    datatype = "varchar",
    length = NA,
    scale = NA,
    name = "Token",
    stringsAsFactors = FALSE
  )
}