#!/usr/bin/env python
"""Build vocabmosaic-textboxes.txt from the system word list.

The large American English dictionary is sorted with the shell ``sort``
utility, filtered down to plain A-Z words that fit on one text box line,
upper-cased, and written out in numbered text boxes.
"""

##########################################################

import os
import re

##########################################################

# Longest word that still fits on a single text box line.
MAX_WORD_LENGTH = 38

# Matches anything that ends in apostrophe-s (case-insensitive).
POSSESSIVE = re.compile(r".*'s$", re.I)

# Matches words made only of the letters A-Z (case-insensitive).
LETTERS = re.compile(r"^[A-Z]+$", re.I)

# The only two-letter words that are kept.  A frozenset gives O(1) lookups
# instead of scanning a list for every two-letter candidate.
SHORT_WORDS = frozenset([
    "AA", "AB", "AD", "AE", "AG", "AH", "AI", "AL", "AM", "AN", "AR", "AS",
    "AT", "AW", "AX", "AY", "BA", "BE", "BI", "BO", "BY", "DE", "DO", "ED",
    "EF", "EH", "EL", "EM", "EN", "ER", "ES", "ET", "EX", "FA", "GO", "HA",
    "HE", "HI", "HM", "HO", "ID", "IF", "IN", "IS", "IT", "JO", "KA", "LA",
    "LI", "LO", "MA", "ME", "MI", "MM", "MO", "MU", "MY", "NA", "NE", "NO",
    "NU", "OD", "OE", "OF", "OH", "OM", "ON", "OP", "OR", "OS", "OW", "OX",
    "OY", "PA", "PE", "PI", "RE", "SH", "SI", "SO", "TA", "TI", "TO", "UH",
    "UM", "UN", "UP", "US", "UT", "WE", "WO", "XI", "XU", "YA", "YE", "YO",
])

##########################################################


class TextBoxWriter(object):
    """Write lines of text to a file, grouped into numbered text boxes.

    A header naming the current box is emitted before the first line of
    each box.  Once ``lines_per_box`` lines have been written the box
    number is incremented and the next write starts a new box.
    """

    _HEADER_TEMPLATE = (
        "======================================\n"
        "Box %d\n"
        "--------------------------------------"
    )

    def __init__(self, file, start_box=0, lines_per_box=8):
        """Create a writer.

        file          -- object with a ``write(str)`` method
        start_box     -- number given to the first box
        lines_per_box -- how many lines go into each box (default 8)
        """
        self.file = file
        self.need_header = True
        self.box = start_box
        self.line = 0
        self.lines_per_box = lines_per_box

    def write(self, text):
        """Write one line of text, starting a new box first if needed."""
        if self.need_header:
            self.write_header()
        self.file.write("%s\n" % (text,))
        self.line += 1
        if self.line >= self.lines_per_box:
            # Box is full: the next write opens the following box.
            self.line = 0
            self.box += 1
            self.need_header = True

    def write_header(self):
        """Write the header for the current box."""
        self.file.write("%s\n" % (self._HEADER_TEMPLATE % (self.box,)))
        self.need_header = False


##########################################################


def filter_word(line):
    """Return the upper-cased word from ``line``, or None if it is rejected.

    Rejected words other than single letters and possessives are reported
    on stdout as ``ignored <word>``.  The checks run in a fixed order so
    that exactly the same words are reported as before.
    """
    word = line.strip()
    if len(word) <= 1:
        # ignore single letters (and blank lines)
        return None
    if len(word) > MAX_WORD_LENGTH:
        # ignore words that are too long to fit in a text box line
        print("ignored %s" % (word,))
        return None
    if len(word) == 2 and word.upper() not in SHORT_WORDS:
        print("ignored %s" % (word,))
        return None
    if POSSESSIVE.match(word):
        # ignore anything that ends in apostrophe s
        return None
    if not LETTERS.match(word):
        # ignore any word that contains accented letters or other symbols
        # but print them out of curiosity...
        print("ignored %s" % (word,))
        return None
    return word.upper()


def main():
    """Sort the dictionary and write the accepted words into text boxes."""
    os.system("cat /usr/share/dict/american-english-large | sort > american-english-large.sorted")
    # Context managers guarantee both files are flushed and closed, even
    # if filtering or writing raises part-way through.
    with open("american-english-large.sorted", "r") as src:
        with open("vocabmosaic-textboxes.txt", "w") as dest:
            # start at text box 10 to leave a few for other purposes
            textbox = TextBoxWriter(dest, 10)
            for line in src:
                word = filter_word(line)
                if word is not None:
                    textbox.write(word)


if __name__ == "__main__":
    main()