java-udl-distribute-data-files-py.md
#!/usr/bin/python
# Save this file as UDLDataGen.py
import os
import random
import string
import subprocess
import sys
# Read in the dictionary file to provide random words. Assumes the words
# file is located in /usr/share/dict/words
# The with-statement closes the file as soon as the list has been built,
# even if reading fails part-way through.
with open("/usr/share/dict/words") as wordFile:
    # len(line) still includes the line terminator at this point, so for
    # newline-terminated lines this keeps words of six or more characters.
    wordDict = [line.strip() for line in wordFile if len(line) > 6]
MAXSTR = 4 # Maximum number of words to concatenate
NUMROWS = 1000 # Number of rows of data to generate
#FILEPATH = '/tmp/UDLdata.txt' # Final filename to use for UDL source
TMPFILE = '/tmp/UDLtemp.txt' # Temporary filename.
# Generate a random string by concatenating several words together. Max
# number of words set by MAXSTR
def randomWords():
    """Return between 1 and MAXSTR random words joined by single spaces.

    Words are picked independently from the module-level wordDict list, so
    the same word may appear more than once. Raises IndexError if wordDict
    is empty, because random.choice() cannot pick from an empty sequence.
    """
    wordCount = random.randint(1, MAXSTR)
    # range() instead of xrange() so this runs on both Python 2 and Python 3.
    return " ".join(random.choice(wordDict) for _ in range(wordCount))
# Create a temporary data file that will be moved to a node. Number of
# rows for the file is set by NUMROWS. Adds the name of the node which will
# get the file, to show which node loaded the data.
def generateFile(node):
    """Write NUMROWS pipe-delimited rows to TMPFILE, replacing its contents.

    Each row has the form "<row number>|<random words>|<node>", with row
    numbers starting at 0.

    node -- name of the node that will receive the file; written as the
            last field of every row.
    """
    # The with-statement guarantees the file is flushed and closed even if
    # a write fails part-way through.
    with open(TMPFILE, 'w') as outFile:
        # range() instead of xrange() so this runs on Python 2 and Python 3.
        for rowNum in range(NUMROWS):
            outFile.write('{0}|{1}|{2}\n'.format(rowNum, randomWords(), node))
# Copy the temporary file to a node. Only works if passwordless SSH login
# is enabled, which it is for the database administrator account on
# Vertica hosts.
def copyFile(fileName,node):
    """Copy TMPFILE to the path fileName on the host node using scp.

    fileName -- destination path on the remote node.
    node     -- host name of the node to copy the file to.

    A failed copy is reported on stderr rather than raised, so the caller
    can carry on with the remaining nodes and files.
    """
    destination = '%s:%s' % (node, fileName)
    try:
        # Passing an argument list avoids the shell, so quotes, spaces, '$'
        # or backticks in the arguments reach scp literally.
        status = subprocess.call(['scp', TMPFILE, destination])
    except OSError as err:
        # Raised when the scp executable itself cannot be started.
        sys.stderr.write('Could not run scp for %s: %s\n' % (destination, err))
        return
    if status != 0:
        sys.stderr.write('scp of %s to %s failed with exit status %s\n'
                         % (TMPFILE, destination, status))
# Loop through the comma-separated list of nodes given in the first
# parameter, creating and copying data files whose full comma-separated
# paths are passed in the second parameter
if len(sys.argv) < 3:
    # Without this check a missing argument surfaces as a bare IndexError.
    sys.stderr.write('Usage: %s node1[,node2,...] path1[,path2,...]\n'
                     % sys.argv[0])
    sys.exit(1)
# Skip empty entries (for example from a trailing comma) so no copy is
# attempted with an empty host name or an empty destination path.
nodes = [x.strip() for x in sys.argv[1].split(',') if x.strip()]
fileNames = [y.strip() for y in sys.argv[2].split(',') if y.strip()]
for node in nodes:
    for fileName in fileNames:
        # A single pre-formatted string inside print(...) produces the same
        # output on Python 2 and Python 3.
        print("generating file %s for %s" % (fileName, node))
        # A fresh file is generated for every (node, file) pair.
        generateFile(node)
        print("Copying file to %s" % node)
        copyFile(fileName,node)