MKoD - Soundex Example

MKoD - D Programming Language

Soundex example - code-name toSoundex.d

Converting surnames to a Soundex code value:

/+
 ' Source        : toSoundex.d - Converts a word into a Soundex Code.
 ' Version       : v0.1 Beta
 ' Author        : David L. 'SpottedTiger' Davis
 ' Date Created  : 14.Oct.04 Compiled and Tested with dmd v0.102
 ' Date Modified : 28.Feb.05 Compiled and Tested with dmd v0.114
 '               : 04.Jun.06 Compiled and Tested with dmd v0.160
 '               : 10.Jan.07 Compiled and Tested with dmd v1.0
 '               :
 ' Licence       : Public Domain
 ' --------------------------------------------------------------------
 '
 ' Soundex Rules based on this Web Site:
 ' http://www.archives.gov/research_room/genealogy/census/soundex.html
 ' --------------------------------------------------------------------
 ' C:\dmd\MKoD_ex>dmd toSoundex.d -debug=Test -unittest
 +/
private import std.string;
private import std.stdio;

/+
 ' char[] toSoundex( in char[] )
 '
 ' Return the four character Soundex code for a given string based upon
 ' its phonetic sound.
 '
 ' Params  : s - the word (surname) to encode. Leading and trailing
 '               whitespace is stripped and the word is upper-cased
 '               before encoding. The caller's buffer is never modified.
 '
 ' Returns : a freshly allocated four character code made of the first
 '           character of the stripped, upper-cased word followed by up
 '           to three Soundex digits, right-padded with '0'.
 '           An empty (null) array is returned when the input is empty
 '           or contains only whitespace.
 +/
char[] toSoundex( in char[] s ) 
{
    // Work on a private copy: strip() returns a slice of its argument and
    // toupper() hands back its argument untouched when nothing had to be
    // changed, so without the .dup the in-place transformation below could
    // overwrite the caller's string.
    char[] sWord = toupper( strip( s ) ).dup;
    char[] sSoundex;
    char   cPrevVal;    // last suppressed code; starts as char.init
    char   cPrevVal2;   // final value left at the previous position

    /+ Return a transformed (A thru Z) ASCII alphabetic  
     ' character to its numeric appropriate Soundex Code 
     ' value or special character.
     '                 A,E,I,O,U,Y     = '/'
     '                 H,W             = '*'
     '                 B,F,P,V         = '1'
     '                 C,G,J,K,Q,S,X,Z = '2'
     '                 D,T             = '3'
     '                 L               = '4'
     '                 M,N             = '5'
     '                 R               = '6'
     '                 All others      = '-'
     +/
    static char getSoundexFormattingCode( in char c ) 
    {
        switch ( c ) 
        {
          case 'A','E','I','O','U','Y': 
              return '/';  
              
          case 'H','W': 
              return '*'; 
              
          case 'B','F','P','V': 
              return '1';
              
          case 'C','G','J','K','Q','S','X','Z':
              return '2';
              
          case 'D','T':
              return '3';
        
          case 'L':
              return '4';
            
          case 'M','N':
              return '5';
            
          case 'R':
              return '6';
            
          default:    
              break;
        }

        return '-'; 
        
    } // end char getSoundexFormattingCode( in char )    

    // Nothing to encode: avoid indexing sWord[ 0 ] on an empty array.
    if ( sWord.length == 0 )
        return null;
    
    // Keep the first character of the word as the leading code character.
    // It must be captured before the word is overwritten with codes below.
    sSoundex.length = 1;
    sSoundex[ 0 ] = sWord[ 0 ];
    
    // Transform the characters to a Soundex Code or 
    // a special processing character.
    for ( uint uix = 0; uix < sWord.length; uix++ )
        sWord[ uix ] = getSoundexFormattingCode( sWord[ uix ] );
    
    // Apply a number of the Soundex rules for double characters
    // and side-to-side Soundex code issues
    for ( uint uix = 1; uix < sWord.length; uix++ )
    {
        // Same code as the position just before it, or the same code as
        // the one most recently suppressed: suppress this one too and
        // remember what it was.
        if ( sWord[ uix - 1 ] == sWord[ uix ] || cPrevVal == sWord[ uix ] )
        {
           cPrevVal     = sWord[ uix ];
           sWord[ uix ] = '-';
        }    
        
        // Codes separated by an H or W ('*') count as one code.
        // uix is unsigned, so the bound is written as "uix >= 2";
        // "uix - 2 >= 0" would always be true.
        if ( uix >= 2 && cPrevVal2 == '*' && sWord[ uix - 2 ] == sWord[ uix ] )
            sWord[ uix ] = '-';

        // A vowel ('/') clears the remembered suppressed code, so that a
        // repeated code on the far side of the vowel is kept.
        if ( sWord[ uix ] == '/' && cPrevVal != '*' )
            cPrevVal = '\x20';

        cPrevVal2 = sWord[ uix ];
    }

    // Filter out the unwanted character to create the final Soundex String
    for ( uint uix = 1; uix < sWord.length; uix++ )
        if ( sWord[ uix ] != '/' &&  sWord[ uix ] != '*' && sWord[ uix ] != '-' )
            sSoundex ~= sWord[ uix ];            

    // If the Soundex Code is shorter than four characters, pad it with
    // zeros; the slice below trims the surplus.
    if ( sSoundex.length < 4 )
        sSoundex ~= "0000";
    
    // Return only the first four characters
    return sSoundex[ 0 .. 4 ].dup; 
    
} // end char[] toSoundex( in char[] )

unittest
{
    debug( Test ) writefln( "toSoundex( char[] ).unittest" );

    // Names that sound alike must map to the same code.
    assert( toSoundex( "fraser" )  == "F626" );
    assert( toSoundex( "frazier" ) == "F626" );

    assert( toSoundex( "Vogt" )   == "V230" );
    assert( toSoundex( "Voght" )  == "V230" );
    assert( toSoundex( "Voight" ) == "V230" );

    assert( toSoundex( "Fogt" )   == "F230" );
    assert( toSoundex( "Foght" )  == "F230" );
    assert( toSoundex( "Foight" ) == "F230" );

    // Ordinary surnames.
    assert( toSoundex( "Washington" )  == "W252" );
    assert( toSoundex( "Euler" )       == "E460" );
    assert( toSoundex( "Hilbert" )     == "H416" );
    assert( toSoundex( "Knuth" )       == "K530" );
    assert( toSoundex( "Lukasiewicz" ) == "L222" );
    assert( toSoundex( "VanDeusen" )   == "V532" );
    assert( toSoundex( "Naesmyth" )    == "N253" );
    assert( toSoundex( "Baird" )       == "B630" );

    // Adjacent letters that share a code (doubled letters included)
    // collapse into a single code.
    assert( toSoundex( "Jackson" )   == "J250" );
    assert( toSoundex( "Sheppard" )  == "S163" );
    assert( toSoundex( "Sacks" )     == "S200" );
    assert( toSoundex( "Sherrell" )  == "S640" );
    assert( toSoundex( "Lloyd" )     == "L300" );
    assert( toSoundex( "Gutierrez" ) == "G362" );
    assert( toSoundex( "Callahan" )  == "C450" );
    assert( toSoundex( "Gauss" )     == "G200" );

    // A letter right after the initial that carries the initial's own
    // code is ignored.
    assert( toSoundex( "Pfister" )    == "P236" );
    assert( toSoundex( "Schechwitz" ) == "S232" );
    assert( toSoundex( "Szakal" )     == "S240" );
    assert( toSoundex( "Scklar" )     == "S460" );

    // Equal codes with only an H or W between them count once.
    assert( toSoundex( "Sokwzy" )    == "S200" );
    assert( toSoundex( "Schkolink" ) == "S452" );
    assert( toSoundex( "Schklar" )   == "S460" );
    assert( toSoundex( "Ashcraft" )  == "A261" );
    assert( toSoundex( "Schultz" )   == "S432" );

    // Equal codes with a vowel (A, E, I, O, U or Y) between them are
    // both kept.
    assert( toSoundex( "Tymczak" )    == "T522" );
    assert( toSoundex( "Staten" )     == "S335" );
    assert( toSoundex( "Salkiewicz" ) == "S422" );
    assert( toSoundex( "Simone" )     == "S550" );

    // Names with nothing but vowels, H, W or Y after the initial get
    // an all-zero suffix.
    assert( toSoundex( "Shea" ) == "S000" );
    assert( toSoundex( "Lee" )  == "L000" );

    // Spelling variants of the Womack surname share one code.
    assert( toSoundex( "Wamac" )   == "W520" );
    assert( toSoundex( "Wamack" )  == "W520" );
    assert( toSoundex( "Wammack" ) == "W520" );
    assert( toSoundex( "Wammuck" ) == "W520" );
    assert( toSoundex( "Womac" )   == "W520" );
    assert( toSoundex( "Womach" )  == "W520" );
    assert( toSoundex( "Womich" )  == "W520" );
    assert( toSoundex( "Womick" )  == "W520" );
    assert( toSoundex( "Wommac" )  == "W520" );
    assert( toSoundex( "Wommack" ) == "W520" );
    assert( toSoundex( "Wommuck" ) == "W520" );
    assert( toSoundex( "Wonack" )  == "W520" );
    assert( toSoundex( "Woomack" ) == "W520" );
}

/+
 ' int main()
 '
 ' Demo driver: prints the computed Soundex code for a list of sample
 ' surnames, each followed by the expected code, and returns 0.
 +/
int main()
{
    // Print one line using the given format; the format's single %s
    // receives the Soundex code computed for the name.
    void show( char[] sFormat, char[] sName )
    {
        writefln( sFormat, toSoundex( sName ) );
    }

    debug( Test ) writefln( "unittest done." );

    // "fraser" and "frazier" sound the same, so both are
    // expected to come out as F626.
    show( "fraser=%s, F626",  "fraser"  );
    show( "frazier=%s, F626", "frazier" );

    show( "Euler=%s, E460",       "Euler" );
    show( "Gauss=%s, G200",       "Gauss" );
    show( "Hilbert=%s, H416",     "Hilbert" );
    show( "Knuth=%s, K530",       "Knuth" );
    show( "Lloyd=%s, L300",       "Lloyd" );
    show( "Lukasiewicz=%s, L222", "Lukasiewicz" );
    show( "Washington=%s, W252",  "Washington" );
    show( "Lee=%s, L000",         "Lee" );
    show( "Gutierrez=%s, G362",   "Gutierrez" );
    show( "Pfister=%s, P236",     "Pfister" );
    show( "Jackson=%s, J250",     "Jackson" );
    show( "Tymczak=%s, T522",     "Tymczak" );
    show( "VanDeusen=%s, V532",   "VanDeusen" );
    show( "Ashcraft=%s, A261",    "Ashcraft" );
    show( "Naesmyth=%s, N253",    "Naesmyth" );
    show( "Baird=%s, B630",       "Baird" );
    show( "Callahan=%s, C450",    "Callahan" );
    show( "Schultz=%s, S432",     "Schultz" );

    // Adjacent letters sharing a code are treated as ONE key letter.
    show( "Sheppard=%s, S163", "Sheppard" );
    show( "Sacks=%s, S200",    "Sacks" );
    show( "Sherrell=%s, S640", "Sherrell" );
    show( "Lloyd=%s, L300",    "Lloyd" );

    // A letter right after the initial with the initial's own code
    // is disregarded.
    show( "Schechwitz=%s, S232", "Schechwitz" );
    show( "Szakal=%s, S240",     "Szakal" );
    show( "Scklar=%s, S460",     "Scklar" );

    // Equal codes separated by H or W are coded as one key letter.
    show( "Sokwzy=%s, S200",    "Sokwzy" );
    show( "Schkolink=%s, S452", "Schkolink" );
    show( "Schklar=%s, S460",   "Schklar" );

    // Equal codes separated by A, E, I, O, U or Y are coded separately.
    show( "Staten=%s, S335",     "Staten" );
    show( "Salkiewicz=%s, S422", "Salkiewicz" );
    show( "Simone=%s, S550",     "Simone" );

    // Names with only vowels, W, H or Y after the first letter
    // end in three zeros.
    show( "Shea=%s, S000", "Shea" );
    show( "Lee=%s, L000",  "Lee" );

    return 0;
}

C:\dmd\MKoD_ex>..\bin\dmd tosoundex.d
C:\dmd\bin\..\..\dm\bin\link.exe tosoundex,,,user32+kernel32/noi;

C:\dmd\MKoD_ex>tosoundex
fraser=F626, F626
frazier=F626, F626
Euler=E460, E460
Gauss=G200, G200
Hilbert=H416, H416
Knuth=K530, K530
Lloyd=L300, L300
Lukasiewicz=L222, L222
Washington=W252, W252
Lee=L000, L000
Gutierrez=G362, G362
Pfister=P236, P236
Jackson=J250, J250
Tymczak=T522, T522
VanDeusen=V532, V532
Ashcraft=A261, A261
Naesmyth=N253, N253
Baird=B630, B630
Callahan=C450, C450
Schultz=S432, S432
Sheppard=S163, S163
Sacks=S200, S200
Sherrell=S640, S640
Lloyd=L300, L300
Schechwitz=S232, S232
Szakal=S240, S240
Scklar=S460, S460
Sokwzy=S200, S200
Schkolink=S452, S452
Schklar=S460, S460
Staten=S335, S335
Salkiewicz=S422, S422
Simone=S550, S550
Shea=S000, S000
Lee=L000, L000

C:\dmd\MKoD_ex>