Friday, November 05, 2010

Wordhacking in Python

Reddit is hiring again and number 5 of the bonus points here: http://blog.reddit.com/2010/08/reddit-is-hiring.html is:

5. ...compose us a piece of customized wordhacking, e.g.:
pithier
legible
emerald
alleged
spumoni
excerpt

That sounds like an interesting task and here we will try solve it using Python and more specifically the intersection operation of the set type. For example one solution to the text "reddit is now wordhacking" is :

thrown
daemon
coders
iodide
weighs
catnap
stitch
lusaka
bandit
scorns
sewage


And another:

runways
echoing
doormat
deadsea
itching
tetanus
icecold
soakers
namings
oddness
wriggle

Follows the source. The function that do all the work is make_wordhacking. When run for the first time the program will get a dictionary with words from Internet. The program will print only one (the first) solution for each combination of columns and word size.



from __future__ import division
from urllib2 import urlopen
from collections import defaultdict
from itertools import combinations
from zipfile import ZipFile
import os


def iterate_in_group_by(num, cx):
"iterate_in_group_by(3, 'ABCDEFG') --> ABC DEF G"
c = 0
while c < len(cx):
yield(cx[c:c+num])
c += num

def get_words():
fn = "corncob_lowercase.zip"
if not os.path.isfile(fn):
data = urlopen( "http://www.mieliestronk.com/" + fn ).read()
with open(fn,"w") as fw:
fw.write(data)
fz = ZipFile(fn)
return fz.open(fz.namelist()[0]).read().split()

def make_sets(W):
S=defaultdict(lambda: defaultdict(set))
for word in W:
for i,c in enumerate(word):
S[c][i].add(word)
return S

def factors(n):
for k in range(2,n//2+1):
if n%k == 0:
yield k


def print_reddit_comment_solution(R,P):
print "---------------------------------------------------"
print "We have a solution"
print P
for v in R.values():
for w in [list(a) for a in v]:
for p in range(len(w)):
if p in P:
w[p] = "`[%s](http://)`"%w[p]
print "".join(("\n`%s`" % "".join(w)).split("``"))
break

def print_html_solution(R,P):
print """\n<br/><div style="font-family: monospace; text-align: center; font-size: x-large; line-height: 1em;">"""
for v in R.values():
for w in [list(a) for a in v]:
for p in range(len(w)):
if p in P:
w[p] = """<span style="font-family:monospace; font-weight: bold; background-color: rgb(255, 221, 221);">%s</span>"""%w[p]
else:
w[p] = """<span style="font-family:monospace; font-weight: bold; background-color: rgb(221, 255, 255);">%s</span>""" % w[p]
print "<br/>%s" % "".join(w)
break
print """</div>"""

def make_wordhacking(text,W):
text = [x for x in text if x.isalpha()]
for colnum in factors(len(text)):
print colnum
for w_len in range(colnum,15):
S = make_sets( (word for word in W if len(word)==w_len) )
T = list(iterate_in_group_by(len(text)//colnum,text))
for P in combinations(range(w_len),colnum):
R={}
for i in range(len(T[0])):
C = [col[i] for col in T]
R[i] = S[C[0]][P[0]]
for ind,pos in enumerate(P[1:]):
R[i] = R[i] & S[C[ind+1]][pos]
if not R[i]:
break
else :
print_reddit_comment_solution(R,P)


if __name__ == "__main__":
W =get_words()
text = "reddit is now wordhacking"
make_wordhacking(text,W)







Sometimes it will generate a lot of solutions, sometimes not at all. Try to choose an input text that has a length with a lot of factors.