Converting surnames to a Soundex code value:
/+
' Source : toSoundex.d - Converts a word into a Soundex Code.
' Version : v0.1 Beta
' Author : David L. 'SpottedTiger' Davis
' Date Created : 14.Oct.04 Compiled and Tested with dmd v0.102
' Date Modified : 28.Feb.05 Compiled and Tested with dmd v0.114
' : 04.Jun.06 Compiled and Tested with dmd v0.160
' : 10.Jan.07 Compiled and Tested with dmd v1.0
' :
' Licence : Public Domain
' --------------------------------------------------------------------
'
' Soundex Rules based on this Web Site:
' http://www.archives.gov/research_room/genealogy/census/soundex.html
' --------------------------------------------------------------------
' C:\dmd\MKoD_ex>dmd toSoundex.d -debug=Test -unittest
+/
private import std.string;
private import std.stdio;
/+
 ' char[] toSoundex( in char[] )
 '
 ' Return the four-character Soundex code for a given string based upon
 ' its phonetic sound.
 '
 ' Params:  s = the word (typically a surname) to encode. Leading and
 '              trailing white space is stripped and letter case is
 '              ignored. The caller's array is never written to.
 '
 ' Returns: a freshly allocated four-character array made of the first
 '          character of the stripped, upper-cased word followed by
 '          three Soundex digits (padded with '0'), or an empty array
 '          when s is empty or holds only white space.
 +/
char[] toSoundex( in char[] s )
{
    /+ Return a transformed (A thru Z) ASCII alphabetic
     ' character to its numeric appropriate Soundex Code
     ' value or special character.
     '   A,E,I,O,U,Y     = '/'
     '   H,W             = '*'
     '   B,F,P,V         = '1'
     '   C,G,J,K,Q,S,X,Z = '2'
     '   D,T             = '3'
     '   L               = '4'
     '   M,N             = '5'
     '   R               = '6'
     '   All others      = '-'
     +/
    static char getSoundexFormattingCode( in char c )
    {
        switch ( c )
        {
            case 'A','E','I','O','U','Y':
                return '/';
            case 'H','W':
                return '*';
            case 'B','F','P','V':
                return '1';
            case 'C','G','J','K','Q','S','X','Z':
                return '2';
            case 'D','T':
                return '3';
            case 'L':
                return '4';
            case 'M','N':
                return '5';
            case 'R':
                return '6';
            default:
                break;
        }
        return '-';
    } // end char getSoundexFormattingCode( in char )

    char[] sWord = toupper( strip( s ) );

    // Nothing to encode: without this guard the sWord[ 0 ] access below
    // would index past the end of an empty array.
    if ( sWord.length == 0 )
        return null;

    // toupper() and strip() may hand back a slice of the caller's own
    // array when there is nothing to change, so the working codes are
    // built in a separate buffer instead of being written over sWord.
    char[] sCodes = new char[ sWord.length ];
    foreach ( size_t uix, char c; sWord )
        sCodes[ uix ] = getSoundexFormattingCode( c );

    char[] sSoundex;
    // Both start as char.init (0xFF), which matches none of the codes.
    char cPrevVal;     // code most recently blanked out as a repeat
    char cPrevVal2;    // final code stored at the previous position

    // Keep the first character of the word as the leading character
    sSoundex ~= sWord[ 0 ];

    // Apply a number of the Soundex rules for double characters
    // and side-to-side Soundex code issues. Suppressed positions are
    // overwritten with '-' so the filter below skips them.
    for ( size_t uix = 1; uix < sCodes.length; uix++ )
    {
        // Same code as the position before, or same as the code that
        // was last blanked out: blank this one as well.
        if ( sCodes[ uix - 1 ] == sCodes[ uix ] || cPrevVal == sCodes[ uix ] )
        {
            cPrevVal = sCodes[ uix ];
            sCodes[ uix ] = '-';
        }

        // Same code on both sides of an H or W ('*'): blank this one.
        // The index is unsigned, so test "uix >= 2" rather than
        // "uix - 2 >= 0", which can never be false.
        if ( uix >= 2 && cPrevVal2 == '*' && sCodes[ uix - 2 ] == sCodes[ uix ] )
            sCodes[ uix ] = '-';

        // A vowel marker resets the remembered code (unless it is '*')
        // so the same code may be emitted again after the vowel.
        if ( sCodes[ uix ] == '/' && cPrevVal != '*' )
            cPrevVal = '\x20';

        cPrevVal2 = sCodes[ uix ];
    }

    // Filter out the unwanted characters to create the final Soundex string
    for ( size_t uix = 1; uix < sCodes.length; uix++ )
        if ( sCodes[ uix ] != '/' && sCodes[ uix ] != '*' && sCodes[ uix ] != '-' )
            sSoundex ~= sCodes[ uix ];

    // If the Soundex code is shorter than four characters append four
    // zeros; the slice below trims whatever is not needed.
    if ( sSoundex.length < 4 )
        sSoundex ~= "0000";

    // Return only the first four characters
    return sSoundex[ 0 .. 4 ].dup;
} // end char[] toSoundex( in char[] )
unittest
{
    // Assert that surname encodes to the expected Soundex code.
    // (A failing case is reported at the assert inside this helper.)
    void expectCode( in char[] surname, in char[] code )
    {
        assert( code == toSoundex( surname ) );
    }

    debug( Test ) writefln( "toSoundex( char[] ).unittest" );

    // Names that sound alike must share a code
    expectCode( "fraser",  "F626" );
    expectCode( "frazier", "F626" );
    expectCode( "Vogt",    "V230" );
    expectCode( "Voght",   "V230" );
    expectCode( "Voight",  "V230" );
    expectCode( "Fogt",    "F230" );
    expectCode( "Foght",   "F230" );
    expectCode( "Foight",  "F230" );

    // Ordinary examples
    expectCode( "Washington",  "W252" );
    expectCode( "Euler",       "E460" );
    expectCode( "Hilbert",     "H416" );
    expectCode( "Knuth",       "K530" );
    expectCode( "Lukasiewicz", "L222" );
    expectCode( "VanDeusen",   "V532" );
    expectCode( "Naesmyth",    "N253" );
    expectCode( "Baird",       "B630" );

    // Two or more adjacent key letters with the same number
    // equivalent count as a single key letter.
    expectCode( "Jackson",   "J250" );
    expectCode( "Sheppard",  "S163" );
    expectCode( "Sacks",     "S200" );
    expectCode( "Sherrell",  "S640" );
    expectCode( "Lloyd",     "L300" );
    expectCode( "Gutierrez", "G362" );
    expectCode( "Callahan",  "C450" );
    expectCode( "Gauss",     "G200" );

    // A key letter (or equivalent) directly after an initial letter
    // of the same value is not coded.
    expectCode( "Pfister",    "P236" );
    expectCode( "Schechwitz", "S232" );
    expectCode( "Szakal",     "S240" );
    expectCode( "Scklar",     "S460" );

    // Equivalent key letters separated only by H or W are coded once.
    expectCode( "Sokwzy",    "S200" );
    expectCode( "Schkolink", "S452" );
    expectCode( "Schklar",   "S460" );
    expectCode( "Ashcraft",  "A261" );
    expectCode( "Schultz",   "S432" );

    // Equivalent key letters separated by A, E, I, O, U or Y are
    // each coded separately.
    expectCode( "Tymczak",    "T522" );
    expectCode( "Staten",     "S335" );
    expectCode( "Salkiewicz", "S422" );
    expectCode( "Simone",     "S550" );

    // Names with no key letters after the initial (only vowels,
    // W, H or Y) are padded out with zeros.
    expectCode( "Shea", "S000" );
    expectCode( "Lee",  "L000" );

    // Spelling variations of the Womack surname
    expectCode( "Wamac",   "W520" );
    expectCode( "Wamack",  "W520" );
    expectCode( "Wammack", "W520" );
    expectCode( "Wammuck", "W520" );
    expectCode( "Womac",   "W520" );
    expectCode( "Womach",  "W520" );
    expectCode( "Womich",  "W520" );
    expectCode( "Womick",  "W520" );
    expectCode( "Wommac",  "W520" );
    expectCode( "Wommack", "W520" );
    expectCode( "Wommuck", "W520" );
    expectCode( "Wonack",  "W520" );
    expectCode( "Woomack", "W520" );
}
/+
 ' int main()
 '
 ' Print the computed Soundex code of a list of sample surnames next to
 ' the code each one is expected to produce. Always returns 0.
 +/
int main()
{
    // Write one report line: fmt carries the surname, a %s slot for the
    // computed code, and the expected code.
    void show( in char[] fmt, in char[] surname )
    {
        writefln( fmt, toSoundex( surname ) );
    }

    debug( Test ) writefln( "unittest done." );

    // "fraser" and "frazier" sound phonetically the same, so both
    // should come out as F626.
    show( "fraser=%s, F626", "fraser" );
    show( "frazier=%s, F626", "frazier" );

    show( "Euler=%s, E460", "Euler" );
    show( "Gauss=%s, G200", "Gauss" );
    show( "Hilbert=%s, H416", "Hilbert" );
    show( "Knuth=%s, K530", "Knuth" );
    show( "Lloyd=%s, L300", "Lloyd" );
    show( "Lukasiewicz=%s, L222", "Lukasiewicz" );
    show( "Washington=%s, W252", "Washington" );
    show( "Lee=%s, L000", "Lee" );
    show( "Gutierrez=%s, G362", "Gutierrez" );
    show( "Pfister=%s, P236", "Pfister" );
    show( "Jackson=%s, J250", "Jackson" );
    show( "Tymczak=%s, T522", "Tymczak" );
    show( "VanDeusen=%s, V532", "VanDeusen" );
    show( "Ashcraft=%s, A261", "Ashcraft" );
    show( "Naesmyth=%s, N253", "Naesmyth" );
    show( "Baird=%s, B630", "Baird" );
    show( "Callahan=%s, C450", "Callahan" );
    show( "Schultz=%s, S432", "Schultz" );

    // Two or more adjacent key letters with the same number
    // equivalent count as a single key letter.
    show( "Sheppard=%s, S163", "Sheppard" );
    show( "Sacks=%s, S200", "Sacks" );
    show( "Sherrell=%s, S640", "Sherrell" );
    show( "Lloyd=%s, L300", "Lloyd" );

    // A key letter (or equivalent) directly after an initial letter
    // of the same value is not coded.
    show( "Schechwitz=%s, S232", "Schechwitz" );
    show( "Szakal=%s, S240", "Szakal" );
    show( "Scklar=%s, S460", "Scklar" );

    // Equivalent key letters separated only by H or W are coded once.
    show( "Sokwzy=%s, S200", "Sokwzy" );
    show( "Schkolink=%s, S452", "Schkolink" );
    show( "Schklar=%s, S460", "Schklar" );

    // Equivalent key letters separated by A, E, I, O, U or Y are
    // each coded separately.
    show( "Staten=%s, S335", "Staten" );
    show( "Salkiewicz=%s, S422", "Salkiewicz" );
    show( "Simone=%s, S550", "Simone" );

    // Names with no key letters after the initial (only vowels,
    // W, H or Y) are padded out with zeros.
    show( "Shea=%s, S000", "Shea" );
    show( "Lee=%s, L000", "Lee" );

    return 0;
}
C:\dmd\MKoD_ex>..\bin\dmd tosoundex.d
C:\dmd\bin\..\..\dm\bin\link.exe tosoundex,,,user32+kernel32/noi;
C:\dmd\MKoD_ex>tosoundex
fraser=F626, F626
frazier=F626, F626
Euler=E460, E460
Gauss=G200, G200
Hilbert=H416, H416
Knuth=K530, K530
Lloyd=L300, L300
Lukasiewicz=L222, L222
Washington=W252, W252
Lee=L000, L000
Gutierrez=G362, G362
Pfister=P236, P236
Jackson=J250, J250
Tymczak=T522, T522
VanDeusen=V532, V532
Ashcraft=A261, A261
Naesmyth=N253, N253
Baird=B630, B630
Callahan=C450, C450
Schultz=S432, S432
Sheppard=S163, S163
Sacks=S200, S200
Sherrell=S640, S640
Lloyd=L300, L300
Schechwitz=S232, S232
Szakal=S240, S240
Scklar=S460, S460
Sokwzy=S200, S200
Schkolink=S452, S452
Schklar=S460, S460
Staten=S335, S335
Salkiewicz=S422, S422
Simone=S550, S550
Shea=S000, S000
Lee=L000, L000
C:\dmd\MKoD_ex>