Levenshtein Distance

Tue. December 13, 2011
Categories: PHP
Tags: ,
 
  function levenshtein_distance ($this = "", $that = "") {
    /*
     * Levenshtein Distance
     * by Andrew Brown
     * Written Tuesday, September 8, 2009
     * Revised Friday, October 21, 2011 to add individual transformation costs
     *
     * Calculate the Levenshtein Distance between two strings with default costs.
     * The Levenshtein Distance calculates the total number of alterations that
     * would need to be made on one string to transform it into another, where
     * the only possible alterations are insertion, deletion, replacement, twiddle,
     * and kill.
     *
     * For more information, consult the PHP Documentation:
     * http://us2.php.net/manual/en/function.levenshtein.php
     * or the Wikipedia article:
     * http://en.wikipedia.org/wiki/Levenshtein_distance
     */
 
    // Costs
    $insertion_cost   = 1;
    $deletion_cost    = 1;
    $replacement_cost = 2;
    $copy_cost        = 0;
    $twiddle_cost     = 100;
    $kill_cost        = 100;
 
    // Equal strings?
    if ($this === $that)
      return 0;
 
    // Dimensions of the matrix
    $m = strlen($this);
    $q = strlen($that);
 
    // If either string is empty, the distance will be composed of only insertions or
    // deletions, based on which string is empty.
    if ($m == 0) return $q * $insertion_cost;
    if ($q == 0) return $m * $deletion_cost;
 
    // Start creating the matrix now
    // Top-left should be a 0, always
    $matrix[0][0] = 0;
 
    // Finish making the array in the next two for loops
    for ($x = 1; $x <= $m; $x++) $matrix[$x][0] = $x;
    for ($y = 1; $y <= $q; $y++) $matrix[0][$y] = $y;
 
    // Now go through the matrix and take the path with the least total cost
    // (Cost = number in each matrix cell).  Place the least total cost in the
    // bottom-right cell in the matrix.
    for ($i = 1; $i <= $m; $i++) {
       for ($j = 1; $j <= $q; $j++) {
         // Before anything, the two characters need to be compared. If they're equal,
         // no transformation is needed.
         if ($this[$i - 1] == $that[$j - 1]) {
           $matrix[$i][$j] = $matrix[$i - 1][$j - 1] + $copy_cost;
           continue;
         }
 
         // Assign $minimum a value that is larger than any possible value
         $minimum = $m + $q;
 
         // Look at all possible transformations we can do, and choose the one with
         // the smallest cost to do so:
         $matrix[$i][$j] = min(
 
           // Insertion   
           $matrix[$i - 1][$j] + $insertion_cost,
 
           // Deletion
           $matrix[$i][$j - 1] + $deletion_cost,
 
           // Replacement
           $matrix[$i - 1][$j - 1] + $replacement_cost
 
         );
 
       }
    }
 
    // Return the bottom-right-most element in the table, which represents the cost of
    // the path that takes the least effort (measured by the weights above).
    return $matrix[$m][$q];
  }