The US census bureau uses a special encoding called “soundex” to locate information about a person. The soundex is an encoding of surnames (last names) based on the way a su
import itertools
import re
import sys

# Substitutions applied, in order, to everything after the initial letter:
# first the letters that carry no code are deleted, then each remaining
# consonant group is replaced by its digit.  Compiled once here instead of
# being rebuilt (and the modules re-imported) on every pass of the prompt loop.
_SOUNDEX_SUBSTITUTIONS = (
    (re.compile(r'[aeiouyhwAEIOUYHW]'), ''),
    (re.compile(r'[bfpvBFPV]'), '1'),
    (re.compile(r'[cgjkqsxzCGJKQSXZ]'), '2'),
    (re.compile(r'[dtDT]'), '3'),
    (re.compile(r'[lL]'), '4'),
    (re.compile(r'[mnMN]'), '5'),
    (re.compile(r'[rR]'), '6'),
)


def surname_to_soundex(surname):
    """Return the four-character code for *surname*.

    The first character is kept (upper-cased).  In the rest of the name the
    letters a, e, i, o, u, y, h, w are dropped and every other ASCII letter
    is replaced by its group digit.  Runs of identical characters are then
    collapsed to one, and the result is cut to four characters and
    right-padded with zeros.

    Args:
        surname: Non-empty string to encode.

    Returns:
        The encoded string, e.g. ``"R163"`` for ``"Robert"``.

    Raises:
        ValueError: If *surname* is empty.
    """
    if not surname:
        raise ValueError("surname must be a non-empty string")

    initial, tail = surname[0], surname[1:]
    for pattern, replacement in _SOUNDEX_SUBSTITUTIONS:
        tail = pattern.sub(replacement, tail)

    # NOTE(review): the initial is never turned into a digit, so it is not
    # merged with a following letter of the same group, and characters that
    # are not ASCII letters pass through unchanged.  Kept as-is; confirm
    # against the assignment's rule list before tightening.
    combined = initial.upper() + tail

    # groupby yields one key per run, which collapses repeated characters.
    collapsed = ''.join(char for char, _run in itertools.groupby(combined))

    # Limit to four characters and pad short codes with trailing zeros.
    return collapsed[:4].ljust(4, '0')


def main():
    """Prompt for surnames and print each code until an empty line is entered."""
    surname = input("Enter surname of the author: ")
    while surname != "":
        print(surname_to_soundex(surname) + "\n")
        # Tell the user how to leave: an empty line ends the loop.
        print("Press enter to exit")
        surname = input("Enter surname of the author: ")


if __name__ == "__main__":
    main()
    sys.exit(0)
This is hardly perfect (for instance, it produces the wrong result if the input doesn't start with a letter), and it doesn't implement the rules as independently-testable functions, so it's not really going to serve as an answer to the homework question. But this is how I'd implement it:
def soundex_prepare(s):
    """Prepare string for Soundex encoding.

    Remove non-alpha characters (and the not-of-interest W/H/Y),
    convert to upper case, and collapse every run of a repeated letter
    to a single occurrence.

    Args:
        s: The text to clean (any string, possibly empty).

    Returns:
        An upper-case string containing only the ASCII letters
        A-G, I-V, X and Z, with no letter repeated back to back.
    """
    # re.ASCII matters here: without it, IGNORECASE lets the class also match
    # a few non-ASCII characters (e.g. U+212A KELVIN SIGN, U+017F LONG S),
    # so the negated class would keep them.
    kept = re.sub(r"[^a-gi-vxz]", "", s, flags=re.IGNORECASE | re.ASCII)
    kept = kept.upper()
    # One pass with a backreference squeezes each run of a repeated letter.
    return re.sub(r"(.)\1+", r"\1", kept)
def soundex_encode(s):
    """Encode a name string using the Soundex algorithm.

    The first ASCII letter of *s* becomes the initial of the code; any
    characters before it are skipped.  The remainder is cleaned with
    ``soundex_prepare`` and each letter is mapped to its digit.  Vowels
    produce no digit but do separate equal codes; a digit equal to the
    previous letter's code is dropped.  The initial's own code counts as
    the "previous" code for the first following letter, so "Pfister"
    encodes as P236 and "Lloyd" as L300.

    Args:
        s: The name to encode.

    Returns:
        A four-character code (letter plus three digits, zero-padded),
        or the empty string when *s* contains no ASCII letter.
    """
    # Locate the initial: the first plain ASCII letter in the input.
    start = next(
        (i for i, c in enumerate(s) if "A" <= c <= "Z" or "a" <= c <= "z"),
        None,
    )
    if start is None:
        return ""

    initial = s[start].upper()
    tail = soundex_prepare(s[start + 1:])

    letters = 'ABCDEFGIJKLMNOPQRSTUVXZ'
    codes = '.123.12.22455.12623.122'
    code_of = dict(zip(letters, codes))

    # Seed with the initial's code so an immediately following letter of the
    # same group is not coded again.  H, W and Y have no entry -> no code.
    prev_code = code_of.get(initial, "")

    digits = []
    for letter in tail:
        code = code_of.get(letter)
        if code is None:
            # soundex_prepare is expected to return only letters present in
            # the table; skip anything else instead of raising KeyError.
            continue
        if code != "." and code != prev_code:
            digits.append(code)
            if len(digits) == 3:
                break
        prev_code = code

    return (initial + "".join(digits)).ljust(4, "0")
I would suggest you try the following.
Once you break it down nicely, it should become easier to manage.