C++ example: converting encoding
The following example shows how you can convert encoding for a file from one type to another by converting UTF-16 encoded data to UTF-8.
The following example shows how you can convert encoding for a file from one type to another by converting UTF-16 encoded data to UTF-8. You can find this example in the SDK at /opt/vertica/sdk/examples/FilterFunctions/IConverter.cpp.
Filter implementation
/**
 * UDFilter that re-encodes a byte stream using iconv(3).
 *
 * The conversion descriptor is opened in the constructor from the given
 * source/target encoding names and closed in the destructor. Each call to
 * process() converts as much of the input buffer as fits into the output
 * buffer and tells the caller whether more input or more output space is
 * needed.
 */
class Iconverter : public UDFilter{
private:
    std::string fromEncoding, toEncoding;
    iconv_t cd; // the conversion descriptor opened
    // Running sum of successful iconv() return values. POSIX defines that
    // return value as the number of non-reversible conversions performed;
    // it is only used in the EILSEQ error message below.
    uint converted;
protected:
    /**
     * Convert the unread part of `input` into the free part of `output`.
     *
     * @param srvInterface  not used by this filter.
     * @param input         source buffer; input.offset is advanced past the
     *                      bytes iconv consumed.
     * @param input_state   END_OF_FILE once no further input will arrive.
     * @param output        destination buffer; output.offset is advanced
     *                      past the bytes iconv produced.
     * @return INPUT_NEEDED, OUTPUT_NEEDED or DONE (see the cases below).
     *
     * Reports an error through vt_report_error on an invalid byte sequence,
     * a bad descriptor, or any other unexpected errno.
     */
    virtual StreamState process(ServerInterface &srvInterface, DataBuffer &input,
                                InputState input_state, DataBuffer &output)
    {
        char *input_buf = (char *)input.buf + input.offset;
        char *output_buf = (char *)output.buf + output.offset;
        size_t inBytesLeft = input.size - input.offset, outBytesLeft = output.size - output.offset;

        // end of input
        if (input_state == END_OF_FILE && inBytesLeft == 0)
        {
            // Gnu libc iconv doc says, it is good practice to finalize the
            // outbuffer for stateful encodings (by calling with null inbuffer).
            //
            // http://www.gnu.org/software/libc/manual/html_node/Generic-Conversion-Interface.html
            size_t flushRet = iconv(cd, NULL, NULL, &output_buf, &outBytesLeft);
            int flushErr = errno; // capture before anything else can touch errno

            // output buffer can be updated by this operation
            output.offset = output.size - outBytesLeft;

            // If the reset sequence did not fit, ask for more output space and
            // retry the flush on the next call instead of silently dropping it.
            if (flushRet == (size_t)(-1) && flushErr == E2BIG)
            {
                return OUTPUT_NEEDED;
            }
            return DONE;
        }

        size_t ret = iconv(cd, &input_buf, &inBytesLeft, &output_buf, &outBytesLeft);

        // if conversion is successful, we ask for more input, as input has not reached EOF.
        StreamState retStatus = INPUT_NEEDED;
        if (ret == (size_t)(-1))
        {
            // seen an error
            switch (errno)
            {
            case E2BIG:
                // output buffer has no more room, not a problem, ask for more output.
                retStatus = OUTPUT_NEEDED;
                break;
            case EINVAL:
                // input stops in the middle of a byte sequence, not a problem, ask for more input.
                // At END_OF_FILE no more input can arrive, so the filter finishes
                // and the incomplete trailing sequence is left unconverted.
                retStatus = input_state == END_OF_FILE ? DONE : INPUT_NEEDED;
                break;
            case EILSEQ:
                // invalid sequence seen, throw
                // TODO: reporting the wrong byte position
                vt_report_error(1, "Invalid byte sequence when doing %u-th conversion", converted);
                break; // not expected to be reached; avoids implicit fallthrough
            case EBADF:
                // something wrong with descriptor, throw
                vt_report_error(0, "Invalid descriptor");
                break; // not expected to be reached; avoids implicit fallthrough
            default:
                vt_report_error(0, "Uncommon Error");
                break;
            }
        }
        else
        {
            converted += ret;
        }

        // move position pointer
        input.offset = input.size - inBytesLeft;
        output.offset = output.size - outBytesLeft;
        return retStatus;
    }
public:
    /**
     * Open an iconv conversion descriptor for from -> to.
     *
     * @param from  name of the source encoding.
     * @param to    name of the target encoding.
     *
     * Reports an error through vt_report_error if iconv_open fails.
     */
    Iconverter(const std::string &from, const std::string &to)
        : fromEncoding(from), toEncoding(to), converted(0)
    {
        // note "to encoding" is first argument to iconv...
        cd = iconv_open(to.c_str(), from.c_str());
        if (cd == (iconv_t)(-1))
        {
            // error when creating converters.
            vt_report_error(0, "Error initializing iconv: %m");
        }
    }

    ~Iconverter()
    {
        // free iconv resources;
        iconv_close(cd);
    }
};
Factory implementation
class IconverterFactory : public FilterFactory{
public:
virtual void plan(ServerInterface &srvInterface,
PlanContext &planCtxt) {
std::vector<std::string> args = srvInterface.getParamReader().getParamNames();
/* Check parameters */
if (!(args.size() == 0 ||
(args.size() == 1 && find(args.begin(), args.end(), "from_encoding")
!= args.end()) || (args.size() == 2
&& find(args.begin(), args.end(), "from_encoding") != args.end()
&& find(args.begin(), args.end(), "to_encoding") != args.end()))) {
vt_report_error(0, "Invalid arguments. Must specify either no arguments, or "
"'from_encoding' alone, or 'from_encoding' and 'to_encoding'.");
}
/* Populate planData */
// By default, we do UTF16->UTF8, and x->UTF8
VString from_encoding = planCtxt.getWriter().getStringRef("from_encoding");
VString to_encoding = planCtxt.getWriter().getStringRef("to_encoding");
from_encoding.copy("UTF-16");
to_encoding.copy("UTF-8");
if (args.size() == 2)
{
from_encoding.copy(srvInterface.getParamReader().getStringRef("from_encoding"));
to_encoding.copy(srvInterface.getParamReader().getStringRef("to_encoding"));
}
else if (args.size() == 1)
{
from_encoding.copy(srvInterface.getParamReader().getStringRef("from_encoding"));
}
if (!from_encoding.length()) {
vt_report_error(0, "The empty string is not a valid from_encoding value");
}
if (!to_encoding.length()) {
vt_report_error(0, "The empty string is not a valid to_encoding value");
}
}
virtual UDFilter* prepare(ServerInterface &srvInterface,
PlanContext &planCtxt) {
return vt_createFuncObj(srvInterface.allocator, Iconverter,
planCtxt.getReader().getStringRef("from_encoding").str(),
planCtxt.getReader().getStringRef("to_encoding").str());
}
virtual void getParameterType(ServerInterface &srvInterface,
SizedColumnTypes ¶meterTypes) {
parameterTypes.addVarchar(32, "from_encoding");
parameterTypes.addVarchar(32, "to_encoding");
}
};
// Pass IconverterFactory to the SDK's RegisterFactory macro.
RegisterFactory(IconverterFactory);