こんにちは!今回は、PHPの標準関数であるsimilar_text()について詳しく解説していきます。2つの文字列がどれだけ似ているかを数値で計算できる、非常に便利な関数です!
similar_text関数とは?
similar_text()関数は、2つの文字列の類似度を計算する関数です。
共通する文字列の長さを基に類似度を算出し、パーセンテージで表現することもできます。スペルチェック、重複検出、曖昧検索などで活用できます!
基本的な構文
similar_text(
string $string1,
string $string2,
float &$percent = null
): int
- $string1: 比較する文字列1
- $string2: 比較する文字列2
- $percent: 類似度のパーセンテージ(参照渡し、オプション)
- 戻り値: 両方の文字列で一致した文字数(バイト単位で数えるため、日本語などのマルチバイト文字では実際の文字数と一致しません)
基本的な使用例
シンプルな比較
$str1 = "Hello World";
$str2 = "Hello PHP";

// With two arguments only the number of matching characters is returned.
$similarity = similar_text($str1, $str2);
echo "一致文字数: {$similarity}\n";
// Output: 一致文字数: 6 (the shared prefix "Hello ")

// A third argument receives the similarity as a percentage, which is
// matches * 2 / (strlen($str1) + strlen($str2)) * 100.
$similarity = similar_text($str1, $str2, $percent);
echo "一致文字数: {$similarity}\n";
echo "類似度: " . round($percent, 2) . "%\n";
// Output: 類似度: 60% (6 * 2 / (11 + 9) * 100)
完全一致と完全不一致
// 完全一致
$str1 = "Hello";
$str2 = "Hello";
similar_text($str1, $str2, $percent);
echo "完全一致: {$percent}%\n";
// 出力: 100%
// 完全不一致
$str1 = "ABC";
$str2 = "XYZ";
similar_text($str1, $str2, $percent);
echo "完全不一致: {$percent}%\n";
// 出力: 0%
大文字小文字の区別
$str1 = "Hello";
$str2 = "hello";

// The comparison is case-sensitive: only "ello" matches here.
similar_text($str1, $str2, $percent);
echo "異なる大文字小文字: {$percent}%\n";
// Output: 異なる大文字小文字: 80%

// Fold both strings to lower case first for a case-insensitive comparison.
$lower1 = strtolower($str1);
$lower2 = strtolower($str2);
similar_text($lower1, $lower2, $percent);
echo "小文字に統一: {$percent}%\n";
// Output: 小文字に統一: 100%
順序の影響
// Same characters in the opposite order: only one character can be matched,
// so the similarity drops sharply.
list($str1, $str2) = ["ABC", "CBA"];

similar_text($str1, $str2, $percent);
echo "逆順: {$percent}%\n";
// Output: about 33.33% (1 * 2 / (3 + 3) * 100)
部分一致
$str1 = "Programming";
$str2 = "Program";

similar_text($str1, $str2, $percent);
echo "部分一致: {$percent}%\n";
// Output: about 77.78% (7 * 2 / (11 + 7) * 100)

// Same pair with the arguments swapped.
similar_text($str2, $str1, $percent);
echo "逆の順序: {$percent}%\n";
// Output: about 77.78% again. The PHP manual warns that swapping the
// arguments may yield a different result, but for this pair the match
// ("Program") is the same in both directions, so the percentage is too.
実践的な使用例
例1: テキスト類似度チェッカー
class SimilarityChecker {
    /**
     * Measure how similar two strings are.
     *
     * Unless $caseSensitive is truthy, both inputs are lower-cased with
     * strtolower() first, and those lower-cased forms are what the result
     * reports under 'text1' / 'text2'.
     *
     * @param string $text1
     * @param string $text2
     * @param bool   $caseSensitive
     * @return array Keys: text1, text2, matches, percent (2 decimals), rating.
     */
    public static function calculate($text1, $text2, $caseSensitive = false) {
        $left = $caseSensitive ? $text1 : strtolower($text1);
        $right = $caseSensitive ? $text2 : strtolower($text2);

        $matched = similar_text($left, $right, $similarity);

        return [
            'text1' => $left,
            'text2' => $right,
            'matches' => $matched,
            'percent' => round($similarity, 2),
            // The rating is derived from the unrounded percentage.
            'rating' => self::getRating($similarity),
        ];
    }

    /**
     * Translate a percentage into a human-readable rating label.
     *
     * @param float $percent
     * @return string
     */
    private static function getRating($percent) {
        // Lower bound => label, checked from the highest bound downwards.
        $scale = [
            90 => '非常に類似',
            70 => '類似',
            50 => 'やや類似',
            30 => 'わずかに類似',
        ];

        foreach ($scale as $minimum => $label) {
            if ($percent >= $minimum) {
                return $label;
            }
        }

        return '類似なし';
    }

    /**
     * Compare one target against several candidates (case-insensitively).
     *
     * @param string   $target
     * @param string[] $candidates
     * @return array[] calculate() results, each extended with the original
     *                 'candidate', ordered by descending 'percent'.
     */
    public static function compareMultiple($target, $candidates) {
        $ranked = [];

        foreach ($candidates as $candidate) {
            // 'candidate' keeps the caller's spelling; 'text2' is lower-cased.
            $ranked[] = self::calculate($target, $candidate) + ['candidate' => $candidate];
        }

        usort($ranked, function ($first, $second) {
            return $second['percent'] <=> $first['percent'];
        });

        return $ranked;
    }

    /**
     * Return the candidate that is most similar to the target.
     *
     * @param string   $target
     * @param string[] $candidates
     * @param float    $threshold Minimum percentage the best match must reach.
     * @return string|null Null when there are no candidates or the best one
     *                     scores below $threshold.
     */
    public static function findMostSimilar($target, $candidates, $threshold = 0) {
        $ranked = self::compareMultiple($target, $candidates);
        $best = reset($ranked); // false when the list is empty

        if ($best === false || $best['percent'] < $threshold) {
            return null;
        }

        return $best['candidate'];
    }

    /**
     * Build a matrix of pairwise similarity percentages.
     *
     * The diagonal is fixed at 100.0 and each pair is computed once (case
     * sensitively); the mirrored cell reuses that value.
     *
     * @param string[] $strings
     * @return array Matrix indexed by the keys of $strings on both axes.
     */
    public static function createMatrix($strings) {
        $matrix = [];

        foreach ($strings as $rowKey => $rowText) {
            foreach ($strings as $colKey => $colText) {
                if ($rowKey === $colKey) {
                    $matrix[$rowKey][$colKey] = 100.0;
                    continue;
                }

                if (isset($matrix[$colKey][$rowKey])) {
                    // Already computed for the mirrored cell.
                    $matrix[$rowKey][$colKey] = $matrix[$colKey][$rowKey];
                    continue;
                }

                similar_text($rowText, $colText, $similarity);
                $matrix[$rowKey][$colKey] = round($similarity, 2);
            }
        }

        return $matrix;
    }
}
// Usage example
echo "=== 類似度計算 ===\n";
print_r(SimilarityChecker::calculate("Hello World", "Hello PHP"));

echo "\n=== 複数比較 ===\n";
$target = "apple";
$candidates = ["aple", "apples", "application", "banana"];
foreach (SimilarityChecker::compareMultiple($target, $candidates) as $result) {
    echo "{$result['candidate']}: {$result['percent']}% ({$result['rating']})\n";
}

echo "\n=== 最も類似した文字列 ===\n";
$pool = ["programing", "programmer", "program", "coding"];
$mostSimilar = SimilarityChecker::findMostSimilar("programming", $pool);
echo "最も類似: {$mostSimilar}\n";

echo "\n=== 類似度マトリックス ===\n";
$strings = ["cat", "bat", "rat", "car"];
$matrix = SimilarityChecker::createMatrix($strings);
foreach ($matrix as $i => $row) {
    // One line per string: its label followed by the scores against every string.
    echo $strings[$i], ": ";
    foreach ($row as $percent) {
        printf("%6.2f ", $percent);
    }
    echo "\n";
}
例2: スペルチェッカー
class SpellChecker {
    /** @var string[] Dictionary words, stored lower-cased. */
    private $dictionary = [];

    /**
     * @param string[] $words Known-good words. They are lower-cased so that
     *                        lookups are case-insensitive.
     */
    public function __construct($words) {
        $this->dictionary = array_map('strtolower', $words);
    }

    /**
     * Check a single word against the dictionary.
     *
     * @param string $word
     * @return array Keys: correct (bool), word (lower-cased input),
     *               suggestions (empty when the word is correct).
     */
    public function check($word) {
        $word = strtolower($word);

        // Strict comparison: a loose in_array() compares numeric-looking
        // strings as numbers, so "1e3" would be accepted because the
        // dictionary contains "1000".
        $known = in_array($word, $this->dictionary, true);

        return [
            'correct' => $known,
            'word' => $word,
            'suggestions' => $known ? [] : $this->getSuggestions($word),
        ];
    }

    /**
     * Find dictionary words that resemble the given word.
     *
     * @param string $word
     * @param int    $maxSuggestions Maximum number of suggestions returned.
     * @param float  $minSimilarity  Minimum similar_text() percentage.
     * @return array[] Entries with 'word' and 'similarity' (2 decimals),
     *                 ordered by descending similarity.
     */
    public function getSuggestions($word, $maxSuggestions = 5, $minSimilarity = 50) {
        $word = strtolower($word);
        $suggestions = [];

        foreach ($this->dictionary as $dictWord) {
            similar_text($word, $dictWord, $percent);
            if ($percent >= $minSimilarity) {
                $suggestions[] = [
                    'word' => $dictWord,
                    'similarity' => round($percent, 2),
                ];
            }
        }

        usort($suggestions, function ($a, $b) {
            return $b['similarity'] <=> $a['similarity'];
        });

        return array_slice($suggestions, 0, $maxSuggestions);
    }

    /**
     * Check every whitespace-separated word of a text.
     *
     * Characters outside \w (punctuation and the like) are stripped from
     * each token before it is checked; tokens that end up empty are skipped.
     *
     * @param string $text
     * @return array[] One check() result per remaining token, in text order.
     */
    public function checkText($text) {
        $results = [];

        foreach (preg_split('/\s+/', $text) as $word) {
            $cleanWord = preg_replace('/[^\w]/', '', $word);

            // Compare against '' explicitly: empty() would also discard the
            // legitimate token "0".
            if ($cleanWord !== null && $cleanWord !== '') {
                $results[] = $this->check($cleanWord);
            }
        }

        return $results;
    }

    /**
     * Return only the check results of misspelled words.
     *
     * @param string $text
     * @return array[] Subset of checkText(); the original positions are
     *                 preserved as array keys, so keys may be non-contiguous.
     */
    public function getErrors($text) {
        return array_filter($this->checkText($text), function ($result) {
            return !$result['correct'];
        });
    }
}
// Usage example
$dictionary = [
    'apple', 'banana', 'orange', 'grape',
    'programming', 'developer', 'application',
    'computer', 'software', 'hardware'
];
$checker = new SpellChecker($dictionary);

echo "=== スペルチェック ===\n";
$word = "aple";
$result = $checker->check($word);
if (!$result['correct']) {
    echo "「{$word}」のスペルが間違っています。\n";
    echo "候補:\n";
    foreach ($result['suggestions'] as $suggestion) {
        echo " - {$suggestion['word']} ({$suggestion['similarity']}%)\n";
    }
}

echo "\n=== テキストチェック ===\n";
$text = "I like aples and banannas. Programing is fun.";
foreach ($checker->getErrors($text) as $error) {
    echo "エラー: {$error['word']}\n";
    if (empty($error['suggestions'])) {
        continue; // nothing in the dictionary is close enough
    }
    // Suggestions are sorted by similarity, so the first one is the best.
    $topSuggestion = $error['suggestions'][0];
    echo " もしかして: {$topSuggestion['word']}?\n";
}
例3: 重複検出システム
class DuplicateDetector {
    /**
     * Find every pair of items whose case-insensitive similarity reaches
     * the threshold.
     *
     * @param string[] $items     Any array of strings; keys are ignored.
     * @param float    $threshold Minimum similar_text() percentage.
     * @return array[] Entries with 'item1', 'item2' and 'similarity'
     *                 (2 decimals), in order of first appearance.
     */
    public static function findDuplicates($items, $threshold = 80) {
        // Re-index so sparse or associative input can be walked by position.
        $items = array_values($items);

        return self::describePairs($items, self::similarPairs($items, $threshold));
    }

    /**
     * Greedily group similar items.
     *
     * Each not-yet-grouped item starts a group and pulls in every later
     * ungrouped item that is similar enough to it. Groups with a single
     * member are dropped.
     *
     * @param string[] $items
     * @param float    $threshold Minimum similar_text() percentage.
     * @return array[] List of groups, each a list of the original strings.
     */
    public static function groupSimilar($items, $threshold = 80) {
        $groups = [];
        $processed = []; // set of already grouped keys (key => true)

        foreach ($items as $i => $item1) {
            if (isset($processed[$i])) {
                continue;
            }

            $group = [$item1];
            $processed[$i] = true;
            $needle = strtolower($item1);

            foreach ($items as $j => $item2) {
                if (isset($processed[$j])) {
                    continue;
                }

                similar_text($needle, strtolower($item2), $percent);
                if ($percent >= $threshold) {
                    $group[] = $item2;
                    $processed[$j] = true;
                }
            }

            if (count($group) > 1) {
                $groups[] = $group;
            }
        }

        return $groups;
    }

    /**
     * Summarise the duplicates found in a list of items.
     *
     * 'unique_items' is the number of distinct clusters once items linked
     * (directly or transitively) by a duplicate pair are merged. Subtracting
     * the pair count from the item count would be wrong: three near-identical
     * items form three pairs, and four of them form six.
     *
     * @param string[] $items
     * @param float    $threshold Minimum similar_text() percentage.
     * @return array Keys: total_items, duplicate_pairs, duplicates, unique_items.
     */
    public static function generateReport($items, $threshold = 80) {
        $items = array_values($items);
        $pairs = self::similarPairs($items, $threshold);

        return [
            'total_items' => count($items),
            'duplicate_pairs' => count($pairs),
            'duplicates' => self::describePairs($items, $pairs),
            'unique_items' => self::countClusters(count($items), $pairs),
        ];
    }

    /**
     * Compute all similar pairs by position.
     *
     * @param string[] $items     Must be a list (keys 0..n-1).
     * @param float    $threshold
     * @return array[] Entries with 'first', 'second' (positions) and 'similarity'.
     */
    private static function similarPairs(array $items, $threshold) {
        // Lower-case once per item instead of once per comparison.
        $folded = array_map('strtolower', $items);
        $count = count($folded);
        $pairs = [];

        for ($i = 0; $i < $count; $i++) {
            for ($j = $i + 1; $j < $count; $j++) {
                similar_text($folded[$i], $folded[$j], $percent);
                if ($percent >= $threshold) {
                    $pairs[] = [
                        'first' => $i,
                        'second' => $j,
                        'similarity' => round($percent, 2),
                    ];
                }
            }
        }

        return $pairs;
    }

    /**
     * Convert positional pairs into the public item1/item2/similarity shape.
     */
    private static function describePairs(array $items, array $pairs) {
        $described = [];

        foreach ($pairs as $pair) {
            $described[] = [
                'item1' => $items[$pair['first']],
                'item2' => $items[$pair['second']],
                'similarity' => $pair['similarity'],
            ];
        }

        return $described;
    }

    /**
     * Count connected clusters of items using a union-find over the pairs.
     */
    private static function countClusters($itemCount, array $pairs) {
        $parent = [];
        for ($k = 0; $k < $itemCount; $k++) {
            $parent[$k] = $k;
        }

        $clusters = $itemCount;
        foreach ($pairs as $pair) {
            $rootA = self::findRoot($parent, $pair['first']);
            $rootB = self::findRoot($parent, $pair['second']);
            if ($rootA !== $rootB) {
                // Merging two separate clusters leaves one cluster fewer.
                $parent[$rootB] = $rootA;
                $clusters--;
            }
        }

        return $clusters;
    }

    /**
     * Find the representative of a node's cluster, shortening the path on the way.
     */
    private static function findRoot(array &$parent, $node) {
        while ($parent[$node] !== $node) {
            $parent[$node] = $parent[$parent[$node]];
            $node = $parent[$node];
        }

        return $node;
    }
}
// Usage example
$products = [
    "Apple iPhone 13 Pro",
    "Apple iPhone 13 Pro Max",
    "Samsung Galaxy S21",
    "Samsung Galaxy S21 Ultra",
    "Apple iPhone 13",
    "Sony PlayStation 5"
];

echo "=== 重複検出 ===\n";
$duplicates = DuplicateDetector::findDuplicates($products, 70);
foreach ($duplicates as $dup) {
    echo "{$dup['item1']}\n", " ↔ {$dup['item2']}\n", " 類似度: {$dup['similarity']}%\n\n";
}

echo "=== グループ化 ===\n";
$groups = DuplicateDetector::groupSimilar($products, 70);
foreach ($groups as $i => $group) {
    // Groups are numbered from 1 for display.
    echo "グループ " . ($i + 1) . ":\n";
    foreach ($group as $item) {
        echo " - {$item}\n";
    }
    echo "\n";
}

echo "=== レポート ===\n";
$report = DuplicateDetector::generateReport($products, 70);
print_r($report);
例4: 検索システム
class FuzzySearchEngine {
    /** @var array[] Items keyed by id, each with 'id', 'title' and 'description'. */
    private $data = [];

    /**
     * Register (or replace) an item.
     *
     * @param int|string $id
     * @param string     $title
     * @param string     $description
     */
    public function addItem($id, $title, $description = '') {
        $this->data[$id] = [
            'id' => $id,
            'title' => $title,
            'description' => $description,
        ];
    }

    /**
     * Fuzzy search over titles and descriptions (case-insensitive).
     *
     * An item's score is the higher of its title and description similarity.
     *
     * @param string $query
     * @param float  $minSimilarity Minimum score for an item to be returned.
     * @param int    $maxResults
     * @return array[] Entries with 'item', 'similarity', 'title_match' and
     *                 'desc_match' (2 decimals), best match first.
     */
    public function search($query, $minSimilarity = 50, $maxResults = 10) {
        $results = [];
        $query = strtolower($query);

        foreach ($this->data as $item) {
            similar_text($query, strtolower($item['title']), $titlePercent);
            similar_text($query, strtolower($item['description']), $descPercent);

            $similarity = max($titlePercent, $descPercent);
            if ($similarity >= $minSimilarity) {
                $results[] = [
                    'item' => $item,
                    'similarity' => round($similarity, 2),
                    'title_match' => round($titlePercent, 2),
                    'desc_match' => round($descPercent, 2),
                ];
            }
        }

        return array_slice(self::rankBySimilarity($results), 0, $maxResults);
    }

    /**
     * Fuzzy search over titles only (case-insensitive).
     *
     * @param string $query
     * @param float  $minSimilarity Minimum score for an item to be returned.
     * @return array[] Entries with 'item' and 'similarity', best match first.
     */
    public function searchTitle($query, $minSimilarity = 50) {
        $results = [];
        $query = strtolower($query);

        foreach ($this->data as $item) {
            similar_text($query, strtolower($item['title']), $percent);
            if ($percent >= $minSimilarity) {
                $results[] = [
                    'item' => $item,
                    'similarity' => round($percent, 2),
                ];
            }
        }

        return self::rankBySimilarity($results);
    }

    /**
     * Rank all other items by title similarity to the given item.
     *
     * @param int|string $itemId
     * @param int        $maxResults
     * @return array[] Entries with 'item' and 'similarity', best match first;
     *                 empty when $itemId is unknown.
     */
    public function findRelated($itemId, $maxResults = 5) {
        if (!isset($this->data[$itemId])) {
            return [];
        }

        $targetTitle = strtolower($this->data[$itemId]['title']);

        // Drop the target through the same key lookup isset() used. A strict
        // "$id === $itemId" check fails when the id arrives as the string "1"
        // while the array key is the integer 1, which made the item show up
        // as related to itself.
        $others = $this->data;
        unset($others[$itemId]);

        $results = [];
        foreach ($others as $item) {
            similar_text($targetTitle, strtolower($item['title']), $percent);
            $results[] = [
                'item' => $item,
                'similarity' => round($percent, 2),
            ];
        }

        return array_slice(self::rankBySimilarity($results), 0, $maxResults);
    }

    /**
     * Sort result entries by descending 'similarity'.
     */
    private static function rankBySimilarity(array $results) {
        usort($results, function ($a, $b) {
            return $b['similarity'] <=> $a['similarity'];
        });

        return $results;
    }
}
// Usage example
$engine = new FuzzySearchEngine();

// id, title, description
$catalogue = [
    [1, 'PHP Programming Guide', 'Learn PHP from basics to advanced'],
    [2, 'Python Programming Tutorial', 'Python for beginners'],
    [3, 'PHP Web Development', 'Build websites with PHP'],
    [4, 'JavaScript Essentials', 'Master JavaScript'],
    [5, 'PHP Advanced Techniques', 'Advanced PHP programming'],
];
foreach ($catalogue as $entry) {
    $engine->addItem($entry[0], $entry[1], $entry[2]);
}

echo "=== 曖昧検索 ===\n";
$query = "PHP programing"; // deliberately misspelled
$results = $engine->search($query, 40);
foreach ($results as $result) {
    echo "{$result['item']['title']}\n";
    echo " 類似度: {$result['similarity']}%\n";
    echo " (タイトル: {$result['title_match']}%, 説明: {$result['desc_match']}%)\n\n";
}

echo "\n=== 関連アイテム ===\n";
$related = $engine->findRelated(1, 3);
echo "「PHP Programming Guide」の関連アイテム:\n";
foreach ($related as $item) {
    echo " - {$item['item']['title']} ({$item['similarity']}%)\n";
}
例5: 盗作チェック(剽窃検出)
class PlagiarismChecker {
    /**
     * Split a text at runs of '.', '!' and '?', dropping empty pieces.
     *
     * @param string $text
     * @return string[] Sentence fragments, not trimmed.
     */
    private static function splitSentences($text) {
        return preg_split('/[.!?]+/', $text, -1, PREG_SPLIT_NO_EMPTY);
    }

    /**
     * Compare two whole texts case-insensitively.
     *
     * @param string $text1
     * @param string $text2
     * @return array Keys: overall_similarity (2 decimals) and rating.
     */
    public static function compare($text1, $text2) {
        $lowerFirst = strtolower($text1);
        $lowerSecond = strtolower($text2);
        similar_text($lowerFirst, $lowerSecond, $score);

        return [
            'overall_similarity' => round($score, 2),
            // The rating is derived from the unrounded score.
            'rating' => self::getRating($score),
        ];
    }

    /**
     * Compare every sentence of one text with every sentence of the other.
     *
     * Sentences are trimmed and compared case-insensitively; the reported
     * sentences keep their original case.
     *
     * @param string $text1
     * @param string $text2
     * @param float  $threshold Minimum percentage for a pair to be reported.
     * @return array[] Entries with 'sentence1', 'sentence2' and 'similarity'.
     */
    public static function compareDetailed($text1, $text2, $threshold = 70) {
        $sentences1 = self::splitSentences($text1);
        $sentences2 = self::splitSentences($text2);
        $matches = [];

        foreach ($sentences1 as $sent1) {
            $leftText = trim($sent1);
            $leftKey = strtolower($leftText);

            foreach ($sentences2 as $sent2) {
                $rightText = trim($sent2);
                similar_text($leftKey, strtolower($rightText), $score);

                if (!($score >= $threshold)) {
                    continue;
                }

                $matches[] = [
                    'sentence1' => $leftText,
                    'sentence2' => $rightText,
                    'similarity' => round($score, 2),
                ];
            }
        }

        return $matches;
    }

    /**
     * Translate a percentage into a plagiarism rating label.
     *
     * @param float $percent
     * @return string
     */
    private static function getRating($percent) {
        // Lower bound => label, checked from the highest bound downwards.
        $scale = [
            80 => '高い類似度(盗作の可能性大)',
            60 => '中程度の類似度(要確認)',
            40 => '低い類似度',
        ];

        foreach ($scale as $minimum => $label) {
            if ($percent >= $minimum) {
                return $label;
            }
        }

        return '類似度なし';
    }

    /**
     * Combine the overall comparison with the sentence-level matches
     * (using compareDetailed()'s default threshold).
     *
     * @param string $text1
     * @param string $text2
     * @return array Keys: overall, matching_sentences, matches.
     */
    public static function generateReport($text1, $text2) {
        $sentenceMatches = self::compareDetailed($text1, $text2);

        return [
            'overall' => self::compare($text1, $text2),
            'matching_sentences' => count($sentenceMatches),
            'matches' => $sentenceMatches,
        ];
    }
}
// Usage example
$original = "PHP is a popular programming language. It is widely used for web development. Many developers love PHP.";
$suspected = "PHP is a well-known programming language. It is commonly used for building websites. Lots of programmers enjoy PHP.";

// Heading corrected: the previous text read "プレイグチェック" ("plague
// check"), a mistranscription of "plagiarism check".
echo "=== 盗作チェック ===\n";
$result = PlagiarismChecker::compare($original, $suspected);
echo "全体の類似度: {$result['overall_similarity']}%\n";
echo "評価: {$result['rating']}\n";

echo "\n=== 詳細比較 ===\n";
// A lower threshold than the default (70) so reworded sentences still show up.
$matches = PlagiarismChecker::compareDetailed($original, $suspected, 60);
foreach ($matches as $match) {
    echo "類似度 {$match['similarity']}%:\n";
    echo " 元: {$match['sentence1']}\n";
    echo " 比較: {$match['sentence2']}\n\n";
}

echo "=== レポート ===\n";
$report = PlagiarismChecker::generateReport($original, $suspected);
print_r($report);
例6: テキスト推薦システム
class RecommendationEngine {
    /** @var array[] Items keyed by id, each with 'id', 'tags' (string) and 'description'. */
    private $items = [];

    /**
     * Register (or replace) an item.
     *
     * @param int|string      $id
     * @param string[]|string $tags        A tag list is joined with spaces.
     * @param string          $description
     */
    public function addItem($id, $tags, $description) {
        $this->items[$id] = [
            'id' => $id,
            'tags' => is_array($tags) ? implode(' ', $tags) : $tags,
            'description' => $description,
        ];
    }

    /**
     * Recommend items similar to a given item.
     *
     * The score is a weighted average of tag similarity (70%) and
     * description similarity (30%), both compared case-insensitively.
     *
     * @param int|string $itemId
     * @param int        $maxRecommendations
     * @return array[] Entries with 'id', 'score', 'tag_similarity' and
     *                 'desc_similarity' (2 decimals), best score first;
     *                 empty when $itemId is unknown.
     */
    public function recommend($itemId, $maxRecommendations = 5) {
        if (!isset($this->items[$itemId])) {
            return [];
        }

        $targetTags = strtolower($this->items[$itemId]['tags']);
        $targetDescription = strtolower($this->items[$itemId]['description']);

        // Drop the target through the same key lookup isset() used. A strict
        // "$id === $itemId" check fails when the id arrives as the string "1"
        // while the array key is the integer 1, which made the item get
        // recommended to itself with a perfect score.
        $others = $this->items;
        unset($others[$itemId]);

        $recommendations = [];
        foreach ($others as $id => $item) {
            similar_text($targetTags, strtolower($item['tags']), $tagPercent);
            similar_text($targetDescription, strtolower($item['description']), $descPercent);

            $score = ($tagPercent * 0.7) + ($descPercent * 0.3);

            $recommendations[] = [
                'id' => $id,
                'score' => round($score, 2),
                'tag_similarity' => round($tagPercent, 2),
                'desc_similarity' => round($descPercent, 2),
            ];
        }

        usort($recommendations, function ($a, $b) {
            return $b['score'] <=> $a['score'];
        });

        return array_slice($recommendations, 0, $maxRecommendations);
    }

    /**
     * Recommend items that match a user's interests.
     *
     * The interests are compared, case-insensitively, against each item's
     * tags and description joined into one string.
     *
     * @param string[]|string $userInterests A list is joined with spaces.
     * @param int             $maxRecommendations
     * @return array[] Entries with 'id' and 'score' (2 decimals), best score first.
     */
    public function recommendByProfile($userInterests, $maxRecommendations = 5) {
        $interestsText = is_array($userInterests)
            ? implode(' ', $userInterests)
            : $userInterests;
        $interestsText = strtolower($interestsText);

        $recommendations = [];
        foreach ($this->items as $id => $item) {
            $combinedText = $item['tags'] . ' ' . $item['description'];
            similar_text($interestsText, strtolower($combinedText), $percent);

            $recommendations[] = [
                'id' => $id,
                'score' => round($percent, 2),
            ];
        }

        usort($recommendations, function ($a, $b) {
            return $b['score'] <=> $a['score'];
        });

        return array_slice($recommendations, 0, $maxRecommendations);
    }
}
// Usage example
$engine = new RecommendationEngine();

// id, tags, description
$catalogue = [
    [1, ['PHP', 'Programming', 'Web'], 'Learn PHP web development'],
    [2, ['Python', 'Programming', 'Data'], 'Python for data science'],
    [3, ['PHP', 'Laravel', 'Framework'], 'Build apps with Laravel'],
    [4, ['JavaScript', 'Web', 'Frontend'], 'JavaScript web development'],
];
foreach ($catalogue as $entry) {
    $engine->addItem($entry[0], $entry[1], $entry[2]);
}

echo "=== アイテム推薦 ===\n";
$recommendations = $engine->recommend(1, 3);
foreach ($recommendations as $rec) {
    echo "アイテム {$rec['id']}: スコア {$rec['score']}%\n";
    echo " (タグ: {$rec['tag_similarity']}%, 説明: {$rec['desc_similarity']}%)\n";
}

echo "\n=== プロファイルベース推薦 ===\n";
$userInterests = ['PHP', 'Web Development', 'Backend'];
$recommendations = $engine->recommendByProfile($userInterests, 3);
foreach ($recommendations as $rec) {
    echo "アイテム {$rec['id']}: スコア {$rec['score']}%\n";
}
例7: 比較ツール
class TextComparator {
    /**
     * Compute the similarity in both argument orders.
     *
     * @param string $str1
     * @param string $str2
     * @return array Keys: str1_to_str2, str2_to_str1, average (all 2
     *               decimals) and symmetric (true when both directions
     *               differ by less than 0.01).
     */
    public static function bidirectionalSimilarity($str1, $str2) {
        similar_text($str1, $str2, $percent1);
        similar_text($str2, $str1, $percent2);

        return [
            'str1_to_str2' => round($percent1, 2),
            'str2_to_str1' => round($percent2, 2),
            'average' => round(($percent1 + $percent2) / 2, 2),
            'symmetric' => abs($percent1 - $percent2) < 0.01,
        ];
    }

    /**
     * Compare every pair of strings (case-sensitively).
     *
     * @param string[] $strings Any array of strings; keys are ignored.
     * @return array[] Entries with 'string1', 'string2' and 'similarity'
     *                 (2 decimals), most similar pair first.
     */
    public static function compareAll($strings) {
        // Re-index so that sparse or associative input (for example the
        // result of array_filter()) can be walked by position.
        $strings = array_values($strings);
        $count = count($strings);
        $results = [];

        for ($i = 0; $i < $count; $i++) {
            for ($j = $i + 1; $j < $count; $j++) {
                similar_text($strings[$i], $strings[$j], $percent);
                $results[] = [
                    'string1' => $strings[$i],
                    'string2' => $strings[$j],
                    'similarity' => round($percent, 2),
                ];
            }
        }

        usort($results, function ($a, $b) {
            return $b['similarity'] <=> $a['similarity'];
        });

        return $results;
    }

    /**
     * Summarise the 'similarity' values of a list of comparisons.
     *
     * @param array[] $comparisons Entries as returned by compareAll().
     * @return array Keys: count, min, max, average (2 decimals), median;
     *               an empty array when there are no comparisons.
     */
    public static function getStatistics($comparisons) {
        if (empty($comparisons)) {
            return [];
        }

        $similarities = array_column($comparisons, 'similarity');

        return [
            'count' => count($comparisons),
            'min' => min($similarities),
            'max' => max($similarities),
            'average' => round(array_sum($similarities) / count($similarities), 2),
            'median' => self::median($similarities),
        ];
    }

    /**
     * Median of a non-empty list of numbers.
     *
     * @param float[] $values
     * @return float|int
     */
    private static function median($values) {
        sort($values);
        $count = count($values);
        // Integer division keeps the array index an int (floor() returns a float).
        $middle = intdiv($count, 2);

        if ($count % 2 === 0) {
            return ($values[$middle - 1] + $values[$middle]) / 2;
        }

        return $values[$middle];
    }
}
// Usage example
echo "=== 双方向類似度 ===\n";
print_r(TextComparator::bidirectionalSimilarity("Hello World", "Hello PHP"));

echo "\n=== 総当たり比較 ===\n";
$strings = ["apple", "aple", "apples", "application", "banana"];
$comparisons = TextComparator::compareAll($strings);
// Only the five most similar pairs are printed.
$topFive = array_slice($comparisons, 0, 5);
foreach ($topFive as $comp) {
    echo "{$comp['string1']} ↔ {$comp['string2']}: {$comp['similarity']}%\n";
}

echo "\n=== 統計情報 ===\n";
$stats = TextComparator::getStatistics($comparisons);
print_r($stats);
levenshtein()との比較
$str1 = "kitten";
$str2 = "sitting";

// similar_text(): similarity as a percentage.
similar_text($str1, $str2, $percent);
echo "similar_text: {$percent}%\n";
// Output: about 61.54% (4 matching characters: 4 * 2 / (6 + 7) * 100)

// levenshtein(): edit distance (number of single-character changes).
$distance = levenshtein($str1, $str2);
echo "levenshtein: {$distance}\n";
// Output: levenshtein: 3 (three changes are needed)

// When to use which:
// similar_text: when you want an overall similarity score
// levenshtein:  when you want to know how severe a typo is
パフォーマンス上の注意
// similar_text() gets slow on long strings: the PHP manual gives its
// complexity as O(N^3), N being the length of the longer string.
//
// Two strings made of one repeated character keep every character-by-character
// scan running to the end, so at 10,000 characters each this call needs on the
// order of 10^11 byte comparisons and runs for minutes. The demo therefore
// uses a shorter length; doubling $length should make the call roughly eight
// times slower.
$length = 500;
$long1 = str_repeat("a", $length);
$long2 = str_repeat("a", $length);

$start = microtime(true);
similar_text($long1, $long2, $percent);
$time = microtime(true) - $start;

echo "処理時間: {$time}秒\n";
echo "類似度: {$percent}%\n";
まとめ
similar_text()関数の特徴をまとめると:
できること:
- 文字列の類似度を計算
- パーセンテージで表現
- 一致する文字数も取得
推奨される使用場面:
- スペルチェック
- 重複検出
- 曖昧検索
- 盗作検出
- 推薦システム
- テキスト比較
利点:
- 直感的な類似度(パーセント)
- 実装が簡単
- 部分一致に強い
注意点:
- 計算量が大きい(O(N^3))
- 大文字小文字を区別
- 順序が結果に影響
- 長い文字列では遅い
関連関数:
- levenshtein(): 編集距離を計算
- soundex(): 発音ベースの比較
- metaphone(): より精度の高い発音比較
使い分け:
// similar_text(): overall similarity, written to $percent as a percentage.
similar_text($str1, $str2, $percent);

// levenshtein(): typos / edit distance between the two strings.
$distance = levenshtein($str1, $str2);

// soundex(): phonetic similarity; true when both strings share a soundex key.
$same = (soundex($str1) === soundex($str2));
similar_text()は、文字列の類似度を直感的に理解しやすい形で提供してくれる便利な関数です。スペルチェックや重複検出など、様々な場面で活躍します。ただし、長い文字列では処理が遅くなる点に注意しましょう!
