Since I write software for a living (and as a hobby), I thought it would be interesting to write a quick Python program to see how many randomly generated words would turn out to be valid words. In this post, I’m not drawing any conclusions or extrapolations from the data, but rather just reporting the results. The key to making this work was finding a way to determine whether the words I was generating were valid or not. So, I decided to bounce the words off the online Merriam-Webster dictionary. After looking at how they construct the URL, I figured out how I could do it. I have provided the source code here as a gist:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import urllib, random, time, datetime, sys | |
def getTime():
    """Return the current local time as a human-readable string.

    The format is the one produced by time.asctime(), for example
    'Sun Jun 20 23:21:05 1993'.
    """
    # asctime() with no argument formats the current local time, which is
    # exactly what asctime(localtime(time())) spelled out by hand.
    return time.asctime()
def getFromUrl(url):
    """Fetch url and return the response body, retrying until a read succeeds.

    When the fetch fails, a message is printed to stdout, the function sleeps
    for a random 1-20 seconds and then tries again. There is no retry limit,
    so the function only returns once a page has been read.

    url -- the address handed to urllib.urlopen().

    Returns whatever read() returns for the opened page.
    """
    while True:
        try:
            webpage = urllib.urlopen(url)
            try:
                return webpage.read()
            finally:
                # Release the connection whether or not read() succeeded.
                webpage.close()
        except Exception as err:
            # Catch Exception rather than using a bare except so that
            # Ctrl-C (KeyboardInterrupt) and SystemExit can still stop the
            # otherwise endless retry loop.
            randSleep = random.randint(1, 20)
            print("[" + getTime() + "] Error occurred while processing web page. "
                  "Trying again after sleeping " + str(randSleep) + " seconds... "
                  + repr(err))
            time.sleep(randSleep)
# Alphabet the random words are drawn from (upper-case A-Z).
letters = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")

# Running totals for the whole session.
validCount = 0
invalidCount = 0
validWordsGenerated = []
timeStartedMillis = int(round(time.time() * 1000))
now = datetime.datetime.now()

# Number of valid words found, keyed by word length (3 through 12).
countByNumberOfCharacters = dict.fromkeys(range(3, 13), 0)

_vowels = "AEIOU"
_logPath = "/cygdrive/c/backup/important_docs/randomWords.log"


def _hasTooManyConsonants(word, limit=4):
    """Return True if word has `limit` or more non-vowel letters in a row."""
    run = 0
    for ch in word:
        if ch in _vowels:
            run = 0
        else:
            run += 1
            if run >= limit:
                return True
    return False


def _classifyPage(page, word):
    """Return (validWord, abbrev) for the dictionary page fetched for word.

    A page carrying an abbreviation marker is reported as an abbreviation and
    never as valid. Otherwise the word is valid when the page contains
    "ENTRIES FOUND" or a "Definition of <em>WORD</em>" heading.
    """
    # These two markers are the only ones that matter: the longer
    # "<h2>WORD</h2> ... abbreviation" patterns all contain one of them,
    # so testing for them separately could never change the outcome.
    if "abbr</em>" in page or "<em>abbreviation</em>" in page:
        return False, True
    if "ENTRIES FOUND" in page or "Definition of <em>" + word.upper() + "</em>" in page:
        return True, False
    return False, False


def _log(f, message=""):
    """Append one line to the open log file f."""
    f.write(message + "\n")


while True:
    sizeOfWord = random.randint(3, 12)
    word = "".join(random.choice(letters) for _ in range(sizeOfWord))

    # A word already known to be valid needs no second lookup.
    if word in validWordsGenerated:
        continue

    # The with-block closes the log on every path, including the early
    # `continue` statements below.
    with open(_logPath, "a") as f:
        # 4 or more consonants in a row: reject without hitting the web site.
        if _hasTooManyConsonants(word):
            invalidCount += 1
            _log(f, "Word '" + word + "' had at least 4 consonants in a row and will be ignored...")
            continue

        uri = "http://www.merriam-webster.com/dictionary/" + word
        lines = getFromUrl(uri)
        if lines is None:
            # Defensive only: getFromUrl retries until it has a page.
            continue

        validWord, abbrev = _classifyPage(lines, word)

        if validWord:
            _log(f, "'" + word + "' is a valid word")
            validCount += 1
            validWordsGenerated.append(word)
            countByNumberOfCharacters[len(word)] += 1
        else:
            if abbrev:
                _log(f, "'" + word + "' is an ABBREVIATION")
            else:
                _log(f, "'" + word + "' is NOT a valid word")
            # NOTE(review): abbreviations are counted as invalid here so the
            # total covers every word looked up; the original nesting of this
            # line was lost, so confirm against the author's gist.
            invalidCount += 1

        totalCount = validCount + invalidCount
        percentOfTotal = float(validCount) / totalCount * 100
        invalidpercentOfTotal = float(invalidCount) / totalCount * 100
        _log(f, "%d words generated: %d valid (%.2f%%) and %d invalid (%.2f%%)"
             % (totalCount, validCount, percentOfTotal, invalidCount, invalidpercentOfTotal))

        currentTimeMillis = int(round(time.time() * 1000))
        rtMinutes = (currentTimeMillis - timeStartedMillis) / 1000.0 / 60.0
        _log(f, "Running time %.2f minutes. Started at " % rtMinutes
             + now.strftime("%m-%d-%Y %H:%M:%S"))

        if validWordsGenerated:
            _log(f)
            _log(f, "Valid words generated so far: " + str(validWordsGenerated))
            _log(f)

        # Per-length breakdown, shortest words first; lengths with no hits
        # are left out.
        printed = False
        for countOfChars in sorted(countByNumberOfCharacters):
            found = countByNumberOfCharacters[countOfChars]
            if found > 0:
                _log(f, "\t" + str(countOfChars) + " characters " + str(found)
                     + " valid words have been generated")
                printed = True
        if printed:
            _log(f)

        # Random "think time" between lookups so the dictionary site is not
        # hammered.
        randomSleepTime = random.randint(3, 8)
        _log(f, "Sleeping for " + str(randomSleepTime) + " seconds...")

    time.sleep(randomSleepTime)
The code essentially uses the letters of the alphabet to try to construct random words in the range of 3 to 12 characters long. Immediately discarded are any randomly generated words with 4 or more consonants in a row OR any words that already exist in the list of words already found to be valid. This saves the app from having to make another web page read when I can determine ahead of time that the word is not valid. Another feature of the program is that it sleeps for a random range of 3 to 8 seconds between each call to Merriam Webster, that way I’m not hammering their server, but rather am behaving more like a real user of the site with “think time” built in.
I ran the program 2 separate times for a total running time of 3.115 days (almost 75 hours – exactly 4484.91 minutes). During these 2 runs, I generated a total of 123,459 words. 640 of the generated words (0.52%) were found to be valid words according to the Merriam Webster dictionary. Of those 640 valid words, 535 were 3 characters long, 97 were 4 characters, 7 were 5 characters, and 1 was a valid 6-character word. The script is supposed to eliminate abbreviations, because there is a way to programmatically detect that Merriam Webster is reporting the word as an abbreviation. However, as I look at the words determined to be valid, many of them appear to be abbreviations, acronyms or otherwise unrecognizable. Here is the list of valid words generated; I will let you make your own decision. Here are the 226 “valid” words generated from run 1:
‘HLA’, ‘UPFOR’, ‘BUN’, ‘CHI’, ‘LUM’, ‘COW’, ‘BUM’, ‘SINE’, ‘ADE’, ‘TAI’, ‘TIS’, ‘CEE’, ‘SUE’, ‘PRE’, ‘SUR’, ‘PAY’, ‘FRO’, ‘APC’, ‘UGH’, ‘NOB’, ‘EOS’, ‘OEM’, ‘LETT’, ‘GAB’, ‘TAB’, ‘ZDV’, ‘BOY’, ‘CATT’, ‘DID’, ‘APL’, ‘GLOM’, ‘GON’, ‘MOON’, ‘ADO’, ‘LYO’, ‘GIA’, ‘HID’, ‘THE’, ‘WAY’, ‘FRA’, ‘OUD’, ‘JOB’, ‘LAO’, ‘IER’, ‘EAT’, ‘RIM’, ‘CORI’, ‘DRAYS’, ‘ZOO’, ‘KUN’, ‘AIX’, ‘LSD’, ‘HOD’, ‘EVE’, ‘BIZ’, ‘ELM’, ‘BUB’, ‘HSI’, ‘SLO’, ‘XTC’, ‘SANA’, ‘OHM’, ‘LAS’, ‘POOR’, ‘WAG’, ‘YON’, ‘VAV’, ‘HIT’, ‘RBI’, ‘ISM’, ‘PEU’, ‘XML’, ‘POI’, ‘SEW’, ‘ZUG’, ‘LOU’, ‘SIV’, ‘JET’, ‘AHI’, ‘GAT’, ‘RSS’, ‘PLY’, ‘RPV’, ‘COZ’, ‘MUD’, ‘DOW’, ‘SUI’, ‘WAC’, ‘MIA’, ‘YUK’, ‘SHM’, ‘HAWK’, ‘DAX’, ‘FIX’, ‘ACL’, ‘DIT’, ‘TEE’, ‘BEY’, ‘DANTE’, ‘WIG’, ‘SET’, ‘PAZ’, ‘VOW’, ‘TIC’, ‘MCO’, ‘GNAT’, ‘GLOW’, ‘VOG’, ‘MEGA’, ‘SOS’, ‘MAB’, ‘FTP’, ‘PALY’, ‘SICK’, ‘NING’, ‘YOD’, ‘ORR’, ‘IGA’, ‘GAN’, ‘ODE’, ‘BUG’, ‘OUR’, ‘JIG’, ‘RAN’, ‘RUG’, ‘YER’, ‘KANT’, ‘ROY’, ‘KAME’, ‘LOW’, ‘HET’, ‘DULL’, ‘LOSE’, ‘HOL’, ‘FEL’, ‘PAU’, ‘FIR’, ‘NIP’, ‘HIB’, ‘CEO’, ‘PPO’, ‘GOA’, ‘MUG’, ‘SAY’, ‘GOT’, ‘MOW’, ‘ATTU’, ‘GET’, ‘AUK’, ‘SEA’, ‘FOI’, ‘ECU’, ‘PUS’, ‘TRY’, ‘VCR’, ‘OOH’, ‘PRY’, ‘IOUS’, ‘AMI’, ‘HEED’, ‘ORB’, ‘TIP’, ‘TUP’, ‘CUI’, ‘ONO’, ‘WEN’, ‘HUM’, ‘PICA’, ‘ROW’, ‘EEK’, ‘KITH’, ‘ABY’, ‘IBN’, ‘PUT’, ‘HAE’, ‘HUN’, ‘DII’, ‘YIP’, ‘EAR’, ‘MHO’, ‘MUR’, ‘TRIX’, ‘FIRS’, ‘VEG’, ‘DUE’, ‘SHAD’, ‘PIS’, ‘ASH’, ‘KOO’, ‘USB’, ‘BAH’, ‘LOFT’, ‘YEA’, ‘ABLY’, ‘PDQ’, ‘BUY’, ‘AIR’, ‘ECK’, ‘IGG’, ‘FUN’, ‘HOST’, ‘UKE’, ‘JIH’, ‘END’, ‘LAG’, ‘PAD’, ‘TETH’, ‘ADZ’, ‘PAL’, ‘SIR’, ‘SAP’, ‘LELE’, ‘OAK’, ‘RETZ’, ‘CIAO’, ‘JUDE’, ‘PUL’, ‘TOUT’, ‘CUT’, ‘COWS’, ‘MIM’, ‘OVI’, ‘JIM’, ‘DDT’, ‘IUD’, ‘LAW’
In looking at this list, these 2 stuck out to me:
‘THE’, ‘WAY’
See Acts 9:2,19:9,19:23,24:14,24:22 for the biblical usage of this phrase…
and here are the 414 “valid” words generated from run 2:
‘HEE’, ‘SUR’, ‘OUR’, ‘DID’, ‘BRA’, ‘DRAB’, ‘PEE’, ‘KIN’, ‘CHU’, ‘RIB’, ‘MAT’, ‘AWE’, ‘RAJ’, ‘UVC’, ‘ASK’, ‘ALES’, ‘VRE’, ‘DPN’, ‘XML’, ‘HOY’, ‘TEE’, ‘TIU’, ‘PAW’, ‘DES’, ‘TUP’, ‘ROB’, ‘KYD’, ‘ABU’, ‘AVO’, ‘GOA’, ‘RUN’, ‘LOP’, ‘SUI’, ‘SEE’, ‘DEZ’, ‘KNUR’, ‘NIM’, ‘FEZ’, ‘BUN’, ‘MOJO’, ‘NOH’, ‘LYS’, ‘UNO’, ‘AGE’, ‘ELK’, ‘LAC’, ‘CHI’, ‘VISE’, ‘HIP’, ‘HUB’, ‘WEN’, ‘ZIG’, ‘WEI’, ‘MEW’, ‘ATE’, ‘END’, ‘LELY’, ‘TOW’, ‘GHAT’, ‘HAI’, ‘MEN’, ‘MUNRO’, ‘BAD’, ‘COX’, ‘RAT’, ‘ETH’, ‘ICS’, ‘HUE’, ‘OUD’, ‘PILED’, ‘PIG’, ‘PARD’, ‘DITZ’, ‘AIN’, ‘SALP’, ‘TSHI’, ‘FOY’, ‘SKI’, ‘PUT’, ‘IVY’, ‘ALE’, ‘HET’, ‘III’, ‘UAV’, ‘XTC’, ‘KUT’, ‘IUD’, ‘GNU’, ‘AWNS’, ‘WAX’, ‘QUA’, ‘ZOO’, ‘QOM’, ‘ULM’, ‘KEYS’, ‘WHY’, ‘JOW’, ‘WOP’, ‘LEE’, ‘CUP’, ‘ZITI’, ‘TEN’, ‘ZAP’, ‘CWM’, ‘YUK’, ‘RAG’, ‘BIO’, ‘TUX’, ‘MOP’, ‘FAN’, ‘HUG’, ‘GEL’, ‘FLU’, ‘DUNG’, ‘HIE’, ‘POI’, ‘SIC’, ‘OAK’, ‘VII’, ‘BOAS’, ‘POM’, ‘IFS’, ‘ONO’, ‘IGD’, ‘BHC’, ‘JIB’, ‘LUM’, ‘TIL’, ‘FUN’, ‘FAT’, ‘TIC’, ‘WET’, ‘SET’, ‘YID’, ‘DOL’, ‘TWA’, ‘IPO’, ‘DING’, ‘URO’, ‘OVI’, ‘SRI’, ‘KOCH’, ‘NAN’, ‘FOX’, ‘RAW’, ‘SOD’, ‘VOW’, ‘EAT’, ‘REM’, ‘RUT’, ‘LYE’, ‘ALLO’, ‘TAX’, ‘TOWS’, ‘TED’, ‘OFT’, ‘HMO’, ‘WOK’, ‘OCA’, ‘RRNA’, ‘FEE’, ‘PRE’, ‘UTE’, ‘NET’, ‘DDE’, ‘DDD’, ‘JAY’, ‘LID’, ‘ISH’, ‘FIRM’, ‘PED’, ‘FIX’, ‘LAN’, ‘PDQ’, ‘DIE’, ‘LOD’, ‘WAD’, ‘POST’, ‘WAY’, ‘WOE’, ‘ALOW’, ‘DEK’, ‘YAK’, ‘SNP’, ‘DHU’, ‘BASE’, ‘CRI’, ‘PAZ’, ‘SPY’, ‘ODE’, ‘CEE’, ‘MHO’, ‘GON’, ‘BALK’, ‘BOSC’, ‘INTI’, ‘OAF’, ‘DAX’, ‘FTP’, ‘ELL’, ‘NUT’, ‘TAU’, ‘HGE’, ‘NEO’, ‘USK’, ‘UFA’, ‘TOL’, ‘DIT’, ‘EOS’, ‘ATP’, ‘SUM’, ‘TWI’, ‘REX’, ‘UCL’, ‘SST’, ‘YALL’, ‘TAW’, ‘ABM’, ‘ANE’, ‘SIR’, ‘VERY’, ‘KAT’, ‘UPAS’, ‘PEW’, ‘HSU’, ‘GTP’, ‘TWO’, ‘MID’, ‘STY’, ‘JOY’, ‘DEE’, ‘YACK’, ‘HUM’, ‘RABI’, ‘GAUD’, ‘DUE’, ‘OAR’, ‘TAO’, ‘JAP’, ‘CRUD’, ‘YAP’, ‘KAY’, ‘EAR’, ‘YON’, ‘JAW’, ‘KOS’, ‘TOM’, ‘DUI’, ‘FOP’, ‘CHA’, ‘DUN’, ‘OUT’, ‘KOO’, ‘TAM’, ‘AWL’, ‘BHA’, ‘POX’, ‘LOW’, ‘GOT’, ‘LULL’, ‘TUB’, ‘MIM’, ‘HOW’, ‘FUG’, ‘KOI’, ‘HIB’, ‘SLO’, ‘PAPA’, ‘XER’, ‘FAG’, ‘PEA’, ‘IST’, ‘TAB’, ‘ROY’, ‘GHB’, ‘SON’, ‘LDL’, ‘JAB’, ‘LOB’, 
‘BENXI’, ‘RAE’, ‘NAB’, ‘TARE’, ‘YIP’, ‘GAY’, ‘OIL’, ‘PIE’, ‘WERT’, ‘FIS’, ‘LAK’, ‘ZUG’, ‘PAL’, ‘BOK’, ‘QAT’, ‘WARN’, ‘LICK’, ‘LANK’, ‘LIKENS’, ‘HORN’, ‘MARE’, ‘DAW’, ‘PYX’, ‘ECK’, ‘FIE’, ‘TUBS’, ‘DME’, ‘TOE’, ‘REB’, ‘NCO’, ‘NOG’, ‘THOR’, ‘AYR’, ‘THO’, ‘BVD’, ‘PAR’, ‘TEPA’, ‘THY’, ‘LAO’, ‘TOD’, ‘OKA’, ‘RYE’, ‘ASS’, ‘GUY’, ‘GEE’, ‘TRI’, ‘CTL’, ‘SAN’, ‘LAH’, ‘TOP’, ‘HEP’, ‘WAS’, ‘AGO’, ‘CIG’, ‘AZA’, ‘HEM’, ‘SOW’, ‘TNT’, ‘TIRL’, ‘GOY’, ‘HOF’, ‘RNA’, ‘GAP’, ‘GOUGH’, ‘ORR’, ‘NEW’, ‘SHY’, ‘EME’, ‘URI’, ‘SAL’, ‘FRA’, ‘HAN’, ‘PEP’, ‘UGLI’, ‘HIT’, ‘ATV’, ‘HAD’, ‘BMX’, ‘PYA’, ‘BARD’, ‘PIT’, ‘RAY’, ‘PISA’, ‘RBI’, ‘GUAN’, ‘AMI’, ‘DUMP’, ‘KOA’, ‘HEN’, ‘FID’, ‘WEE’, ‘HUN’, ‘CPU’, ‘BIS’, ‘CRED’, ‘PILL’, ‘TPN’, ‘GAG’, ‘LOT’, ‘AZT’, ‘PUMP’, ‘DFP’, ‘RAX’, ‘AIX’, ‘EER’, ‘PUN’, ‘PUL’, ‘POW’, ‘JUG’, ‘FIB’, ‘NIN’, ‘THUS’, ‘ZIP’, ‘GOO’, ‘WHIZ’, ‘TYR’, ‘CEL’, ‘MAX’, ‘ALI’, ‘RIG’, ‘KIP’, ‘ZED’, ‘BOB’, ‘NEVE’, ‘FEW’, ‘ROH’, ‘ODD’, ‘UTA’, ‘ADE’, ‘ROD’, ‘GYP’, ‘IVE’, ‘CORM’, ‘IBN’, ‘VID’
I certainly welcome anyone to draw conclusions from this data.