How to Write a Spelling Corrector (incidentally, how cool is python!)

// January 8th, 2010 // Tech

import re, collections

def words(text): return re.findall('[a-z]+', text.lower())

def train(features):

model = collections.defaultdict(lambda: 1)

for f in features:

model[f] += 1

return model

NWORDS = train(words(file('big.txt').read()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'

def edits1(word):

splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]

deletes    = [a + b[1:] for a, b in splits if b]

transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]

replaces   = [a + c + b[1:] for a, b in splits for c in alphabet if b]

inserts    = [a + c + b     for a, b in splits for c in alphabet]

return set(deletes + transposes + replaces + inserts)

def known_edits2(word):

return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)

def known(words): return set(w for w in words if w in NWORDS)

def correct(word):

candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]

return max(candidates, key=NWORDS.get)

Oh man, try implementing this is C++

via How to Write a Spelling Corrector.

Related posts:

  1. Charlie Brooker | iPad therefore iWant? Probably. Why? -The Guardian
  2. Japanese style dessert: Not-so-sweet Tsubu-an: Japanese Azuki Bean Paste
  3. The Royal Fellowship
  4. Britain tastes better when it’s swaddled in Cadbury’s Dairy Milk chocolate
  5. Pakistani ambassador rejected because his name is NSFW in Arabic – Arab world lives up to its reputation


Leave a Reply

Add to Google Subscribe with Bloglines

Subscribe in a reader

Get Adobe Flash playerPlugin by wpburn.com wordpress themes