Problem/Motivation
Japanese, Chinese, and Korean do not separate words with spaces, so the text is split into overlapping character sequences, each of which is indexed separately. For example, the sentence "こんにちは" is divided into "こんに", "んにち", "にちは", and so on.
We can set the minimum number of characters to be indexed. For example, if it is set to 3 characters (the default), the sentence "こんにちは" will be divided into "こんに", "んにち", and "にちは".
On the other hand, if it is set to 2 characters, the words will be "こん", "んに", "にち", and "ちは".
When searching for a word that is longer than this minimum length, the wrong query is executed.
Environment
- Drupal version: 9.2.14-dev
- language: Japanese
- Apache version: 2.4.52
- mysql version: 5.7.29
- PHP version: 7.4.28
Steps to reproduce
Create content with the following titles.
- "東京から京都へ行きます"
- "京都府から富士山が見えました"
Normal search
- Search word: "東京都"
- Result: "東京から京都へ行きます" appears in the search results.
- Expected results: No search results are returned.
Full text
SELECT i.langcode AS langcode, i.type AS type, i.sid AS sid, SUM(((ROUND('1.5585881099241', 4)) * i.score * t.count)) AS calculated_score
FROM
search_index i
INNER JOIN node_field_data n ON n.nid = i.sid AND n.langcode = i.langcode
INNER JOIN search_total t ON i.word = t.word
INNER JOIN search_dataset d ON i.sid = d.sid AND i.type = d.type AND i.langcode = d.langcode
WHERE (n.status = '1') AND ((i.word = '東京') OR (i.word = '京都')) AND (i.type = 'node_search') AND ((d.data LIKE '% 東京 %' ESCAPE '\\') AND (d.data LIKE '% 京都 %' ESCAPE '\\'))
GROUP BY i.langcode, i.type, i.sid
HAVING (COUNT(*) >= '2')
ORDER BY calculated_score DESC
LIMIT 10 OFFSET 0
- Search word: "東京都"
- Result: "東京から京都へ行きます" appears in the search results.
- Expected results: No search results are returned.
Full text
SELECT i.langcode AS langcode, i.type AS type, i.sid AS sid, SUM(((ROUND('1.5585881099241', 4)) * i.score * t.count)) AS calculated_score
FROM
search_index i
INNER JOIN node_field_data n ON n.nid = i.sid AND n.langcode = i.langcode
INNER JOIN search_total t ON i.word = t.word
INNER JOIN search_dataset d ON i.sid = d.sid AND i.type = d.type AND i.langcode = d.langcode
WHERE (n.status = '1') AND ((i.word = '東京') OR (i.word = '京都')) AND (i.type = 'node_search') AND ((d.data LIKE '% 東京 %' ESCAPE '\\') AND (d.data LIKE '% 京都 %' ESCAPE '\\'))
GROUP BY i.langcode, i.type, i.sid
HAVING (COUNT(*) >= '2')
ORDER BY calculated_score DESC
LIMIT 10 OFFSET 0
WHERE clause
WHERE (n.status = '1')
AND ((i.word = '東京') OR (i.word = '京都'))
AND (i.type = 'node_search')
AND ((d.data LIKE '% 東京 %' ESCAPE '\\')
AND (d.data LIKE '% 京都 %' ESCAPE '\\'))
Expected query
WHERE (n.status = '1')
AND ((i.word = '東京') OR (i.word = '京都'))
AND (i.type = 'node_search')
AND ((d.data LIKE '% 東京 京都 %' ESCAPE '\\'))
OR Search
- Search words: "富士山 OR 京都"
- Result: Only "京都府から富士山が見えました" is displayed in the search results, and "東京から京都へ行きます" is not displayed.
- Expected results: Both "京都府から富士山が見えました" and "東京から京都へ行きます" are displayed.
- I think the following query is wrong.
Full text
SELECT i.langcode AS langcode, i.type AS type, i.sid AS sid, SUM(((ROUND('0.93628438291873', 4)) * i.score * t.count)) AS calculated_score
FROM
search_index i
INNER JOIN node_field_data n ON n.nid = i.sid AND n.langcode = i.langcode
INNER JOIN search_total t ON i.word = t.word
INNER JOIN search_dataset d ON i.sid = d.sid AND i.type = d.type AND i.langcode = d.langcode
WHERE (n.status = '1') AND ((i.word = '富士') OR (i.word = '士山') OR (i.word = '京都')) AND (i.type = 'node_search') AND ((d.data LIKE '% 富士 %' ESCAPE '\\') AND ((d.data LIKE '% 士山 %' ESCAPE '\\') OR (d.data LIKE '% 京都 %' ESCAPE '\\')))
GROUP BY i.langcode, i.type, i.sid
ORDER BY calculated_score DESC
LIMIT 10 OFFSET 0
WHERE clause
WHERE (n.status = '1')
AND ((i.word = '富士') OR (i.word = '士山') OR (i.word = '京都'))
AND (i.type = 'node_search')
AND ((d.data LIKE '% 富士 %' ESCAPE '\\') AND ((d.data LIKE '% 士山 %' ESCAPE '\\') OR (d.data LIKE '% 京都 %' ESCAPE '\\')))
Expected query
WHERE (n.status = '1')
AND ((i.word = '富士') OR (i.word = '士山') OR (i.word = '京都'))
AND (i.type = 'node_search')
AND ((d.data LIKE '% 富士 %' ESCAPE '\\') AND ((d.data LIKE '% 士山 %' ESCAPE '\\')) OR (d.data LIKE '% 京都 %' ESCAPE '\\'))
NOT Search
- Search word: "富士山 -東京都"
- Result: "京都府から富士山が見えました" does not appear in the search results.
- Expected result: "京都府から富士山が見えました" is displayed.
Full text
SELECT i.langcode AS langcode, SUM(i.score * t.count) AS calculated_score
FROM
search_index i
INNER JOIN node_field_data n ON n.nid = i.sid AND n.langcode = i.langcode
INNER JOIN search_total t ON i.word = t.word
INNER JOIN search_dataset d ON i.sid = d.sid AND i.type = d.type AND i.langcode = d.langcode
WHERE (n.status = '1') AND ((i.word = '富士') OR (i.word = '士山')) AND (i.type = 'node_search') AND ((d.data LIKE '% 富士 %' ESCAPE '\\') AND (d.data LIKE '% 士山 %' ESCAPE '\\') AND (d.data NOT LIKE '% 東京 %' ESCAPE '\\') AND (d.data NOT LIKE '% 京都 %' ESCAPE '\\'))
GROUP BY i.langcode, i.type, i.sid
ORDER BY calculated_score DESC
LIMIT 1 OFFSET 0
WHERE clause
WHERE (n.status = '1')
AND ((i.word = '富士') OR (i.word = '士山'))
AND (i.type = 'node_search')
AND ((d.data LIKE '% 富士 %' ESCAPE '\\')
AND (d.data LIKE '% 士山 %' ESCAPE '\\')
AND (d.data NOT LIKE '% 東京 %' ESCAPE '\\')
AND (d.data NOT LIKE '% 京都 %' ESCAPE '\\'))
Expected query
WHERE (n.status = '1')
AND ((i.word = '富士') OR (i.word = '士山'))
AND (i.type = 'node_search')
AND ((d.data LIKE '% 富士 %' ESCAPE '\\')
AND (d.data LIKE '% 士山 %' ESCAPE '\\')
AND ((d.data NOT LIKE '% 東京 %' ESCAPE '\\') OR (d.data NOT LIKE '% 京都 %' ESCAPE '\\')))
- Search words: "富士山 OR 京都"
- Result: Only "京都府から富士山が見えました" is displayed in the search results, and "東京から京都へ行きます" is not displayed.
- Expected results: Both "京都府から富士山が見えました" and "東京から京都へ行きます" are displayed.
- I think the following query is wrong.
Full text
SELECT i.langcode AS langcode, i.type AS type, i.sid AS sid, SUM(((ROUND('0.93628438291873', 4)) * i.score * t.count)) AS calculated_score FROM search_index i INNER JOIN node_field_data n ON n.nid = i.sid AND n.langcode = i.langcode INNER JOIN search_total t ON i.word = t.word INNER JOIN search_dataset d ON i.sid = d.sid AND i.type = d.type AND i.langcode = d.langcode WHERE (n.status = '1') AND ((i.word = '富士') OR (i.word = '士山') OR (i.word = '京都')) AND (i.type = 'node_search') AND ((d.data LIKE '% 富士 %' ESCAPE '\\') AND ((d.data LIKE '% 士山 %' ESCAPE '\\') OR (d.data LIKE '% 京都 %' ESCAPE '\\'))) GROUP BY i.langcode, i.type, i.sid ORDER BY calculated_score DESC LIMIT 10 OFFSET 0
WHERE clause
WHERE (n.status = '1') AND ((i.word = '富士') OR (i.word = '士山') OR (i.word = '京都')) AND (i.type = 'node_search') AND ((d.data LIKE '% 富士 %' ESCAPE '\\') AND ((d.data LIKE '% 士山 %' ESCAPE '\\') OR (d.data LIKE '% 京都 %' ESCAPE '\\')))
Expected query
WHERE (n.status = '1') AND ((i.word = '富士') OR (i.word = '士山') OR (i.word = '京都')) AND (i.type = 'node_search') AND ((d.data LIKE '% 富士 %' ESCAPE '\\') AND ((d.data LIKE '% 士山 %' ESCAPE '\\')) OR (d.data LIKE '% 京都 %' ESCAPE '\\'))
NOT Search
- Search word: "富士山 -東京都"
- Result: "京都府から富士山が見えました" does not appear in the search results.
- Expected result: "京都府から富士山が見えました" is displayed.
Full text
SELECT i.langcode AS langcode, SUM(i.score * t.count) AS calculated_score FROM search_index i INNER JOIN node_field_data n ON n.nid = i.sid AND n.langcode = i.langcode INNER JOIN search_total t ON i.word = t.word INNER JOIN search_dataset d ON i.sid = d.sid AND i.type = d.type AND i.langcode = d.langcode WHERE (n.status = '1') AND ((i.word = '富士') OR (i.word = '士山')) AND (i.type = 'node_search') AND ((d.data LIKE '% 富士 %' ESCAPE '\\') AND (d.data LIKE '% 士山 %' ESCAPE '\\') AND (d.data NOT LIKE '% 東京 %' ESCAPE '\\') AND (d.data NOT LIKE '% 京都 %' ESCAPE '\\')) GROUP BY i.langcode, i.type, i.sid ORDER BY calculated_score DESC LIMIT 1 OFFSET 0
WHERE clause
WHERE (n.status = '1') AND ((i.word = '富士') OR (i.word = '士山')) AND (i.type = 'node_search') AND ((d.data LIKE '% 富士 %' ESCAPE '\\') AND (d.data LIKE '% 士山 %' ESCAPE '\\') AND (d.data NOT LIKE '% 東京 %' ESCAPE '\\') AND (d.data NOT LIKE '% 京都 %' ESCAPE '\\'))
Expected query
WHERE (n.status = '1') AND ((i.word = '富士') OR (i.word = '士山')) AND (i.type = 'node_search') AND ((d.data LIKE '% 富士 %' ESCAPE '\\') AND (d.data LIKE '% 士山 %' ESCAPE '\\') AND ((d.data NOT LIKE '% 東京 %' ESCAPE '\\') OR (d.data NOT LIKE '% 京都 %' ESCAPE '\\')))