Levenshtein Distance
Tue. December 13, 2011Categories: PHP
Tags: edit distance, levenshtein
function levenshtein_distance ($this = "", $that = "") { /* * Levenshtein Distance * by Andrew Brown * Written Tuesday, September 8, 2009 * Revised Friday, October 21, 2011 to add individual transformation costs * * Calculate the Levenshtein Distance between two strings with default costs. * The Levenshtein Distance calculates the total number of alterations that * would need to be made on one string to transform it into another, where * the only possible alterations are insertion, deletion, replacement, twiddle, * and kill. * * For more information, consult the PHP Documentation: * http://us2.php.net/manual/en/function.levenshtein.php * or the Wikipedia article: * http://en.wikipedia.org/wiki/Levenshtein_distance */ // Costs $insertion_cost = 1; $deletion_cost = 1; $replacement_cost = 2; $copy_cost = 0; $twiddle_cost = 100; $kill_cost = 100; // Equal strings? if ($this === $that) return 0; // Dimensions of the matrix $m = strlen($this); $q = strlen($that); // If either string is empty, the distance will be composed of only insertions or // deletions, based on which string is empty. if ($m == 0) return $q * $insertion_cost; if ($q == 0) return $m * $deletion_cost; // Start creating the matrix now // Top-left should be a 0, always $matrix[0][0] = 0; // Finish making the array in the next two for loops for ($x = 1; $x <= $m; $x++) $matrix[$x][0] = $x; for ($y = 1; $y <= $q; $y++) $matrix[0][$y] = $y; // Now go through the matrix and take the path with the least total cost // (Cost = number in each matrix cell). Place the least total cost in the // bottom-right cell in the matrix. for ($i = 1; $i <= $m; $i++) { for ($j = 1; $j <= $q; $j++) { // Before anything, the two characters need to be compared. If they're equal, // no transformation is needed. if ($this[$i - 1] == $that[$j - 1]) { $matrix[$i][$j] = $matrix[$i - 1][$j - 1] + $copy_cost; continue; } // Assign $minimum a value that is larger than any possible value $minimum = $m + $q; // Look at all possible transformations we can do, and choose the one with // the smallest cost to do so: $matrix[$i][$j] = min( // Insertion $matrix[$i - 1][$j] + $insertion_cost, // Deletion $matrix[$i][$j - 1] + $deletion_cost, // Replacement $matrix[$i - 1][$j - 1] + $replacement_cost ); } } // Return the bottom-right-most element in the table, which represents the cost of // the path that takes the least effort (measured by the weights above). return $matrix[$m][$q]; }