diff --git a/AUTHORS.rst b/AUTHORS.rst
index ff92ab7eab7..0d177a8c05b 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -107,6 +107,7 @@ Contributors
 * Thomas Lamb -- linkcheck builder
 * Thomas Waldmann -- apidoc module fixes
 * Tim Hoffmann -- theme improvements
+* Tokuhiro Matsuno -- search Unicode normalization
 * Vince Salvino -- JavaScript search improvements
 * Will Maier -- directory HTML builder
 * Zac Hatfield-Dodds -- doctest reporting improvements, intersphinx performance
diff --git a/CHANGES.rst b/CHANGES.rst
index c257b3b11b1..141ace750d0 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -13,6 +13,12 @@ Deprecated
 Features added
 --------------
 
+* #13384: Add a Unicode normalization option for search indexing.
+  This lets users choose the normalization form (NFC, NFD, NFKC, or
+  NFKD) applied to both the search index and search queries, so that
+  different Unicode representations of the same characters match.
+  Patch by Tokuhiro Matsuno.
+
 Bugs fixed
 ----------
 
diff --git a/doc/usage/configuration.rst b/doc/usage/configuration.rst
index 75e08d7654b..08abf205f75 100644
--- a/doc/usage/configuration.rst
+++ b/doc/usage/configuration.rst
@@ -2030,6 +2030,26 @@ and also make use of these options.
 
    .. versionadded:: 1.0
 
+.. confval:: html_search_unicode_normalization
+   :type: :code-py:`str`
+   :default: :code-py:`"NFKD"`
+
+   The type of Unicode normalization applied to the search index at
+   build time and to search queries in the browser. It must be one of
+   the following values:
+
+   * :code-py:`None` -- Disable Unicode normalization.
+   * :code-py:`"NFD"` -- Decompose characters into their canonical decomposed form.
+   * :code-py:`"NFC"` -- Compose characters into their canonical composed form.
+   * :code-py:`"NFKD"` -- Decompose characters into their compatibility decomposed form.
+   * :code-py:`"NFKC"` -- Compose characters into their compatibility composed form.
+
+   Normalizing both the indexed text and the query means that
+   different Unicode representations of the same characters match,
+   improving the accuracy and reliability of search results.
+
+   .. versionadded:: 8.3
+
 .. confval:: html_search_language
    :type: :code-py:`str`
    :default: The value of **language**
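The option goes in ``conf.py`` like any other HTML setting. A minimal
sketch (not part of the patch; NFKC is the form the tests below use)::

    # conf.py -- sketch of the option added by this patch.
    # NFKC folds compatibility characters (full-width letters, ligatures)
    # into their composed equivalents, so 'Ｓｐｈｉｎｘ' and 'Sphinx'
    # produce the same index terms and the same query terms.
    html_search_unicode_normalization = 'NFKC'

    # Setting it to None disables normalization entirely:
    # html_search_unicode_normalization = None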
diff --git a/sphinx/builders/html/__init__.py b/sphinx/builders/html/__init__.py
index 5e6acdeaf9d..91a5d42758e 100644
--- a/sphinx/builders/html/__init__.py
+++ b/sphinx/builders/html/__init__.py
@@ -440,6 +440,7 @@ def prepare_writing(self, docnames: Set[str]) -> None:
             lang,
             self.config.html_search_options,
             self.config.html_search_scorer,
+            self.config.html_search_unicode_normalization,
         )
         self.load_indexer(docnames)
 
@@ -544,6 +545,7 @@ def prepare_writing(self, docnames: Set[str]) -> None:
             'has_source': self.config.html_copy_source,
             'show_source': self.config.html_show_sourcelink,
             'sourcelink_suffix': self.config.html_sourcelink_suffix,
+            'search_unicode_normalization': self.config.html_search_unicode_normalization,
             'file_suffix': self.out_suffix,
             'link_suffix': self.link_suffix,
             'script_files': self._js_files,
@@ -1490,6 +1492,9 @@ def setup(app: Sphinx) -> ExtensionMetadata:
     app.add_config_value(
         'html_show_search_summary', True, 'html', types=frozenset({bool})
     )
+    app.add_config_value(
+        'html_search_unicode_normalization', 'NFKD', 'html', types=frozenset({str})
+    )
     app.add_config_value('html_show_sphinx', True, 'html', types=frozenset({bool}))
     app.add_config_value('html_context', {}, 'html', types=frozenset({dict}))
     app.add_config_value(
diff --git a/sphinx/search/__init__.py b/sphinx/search/__init__.py
index cd0aa0bbd8f..5f99c86d931 100644
--- a/sphinx/search/__init__.py
+++ b/sphinx/search/__init__.py
@@ -9,6 +9,7 @@
 import os
 import pickle
 import re
+import unicodedata
 from importlib import import_module
 from typing import TYPE_CHECKING
 
@@ -21,7 +22,7 @@
 if TYPE_CHECKING:
     from collections.abc import Callable, Iterable
-    from typing import Any, Protocol, TypeVar
+    from typing import Any, Literal, Protocol, TypeVar
 
     from docutils.nodes import Node
 
@@ -275,7 +276,12 @@ class IndexBuilder:
     }
 
     def __init__(
-        self, env: BuildEnvironment, lang: str, options: dict[str, str], scoring: str
+        self,
+        env: BuildEnvironment,
+        lang: str,
+        options: dict[str, str],
+        scoring: str,
+        normalization: Literal['NFC', 'NFKC', 'NFD', 'NFKD'] | None = None,
     ) -> None:
         self._domains = env.domains
         self._env_version = env.version
@@ -301,6 +307,7 @@ def __init__(
         self._objnames: dict[int, tuple[str, str, str]] = env._search_index_objnames
         # add language-specific SearchLanguage instance
         lang_class = languages.get(lang)
+        self._unicode_normalization = normalization
 
         # fallback; try again with language-code
         if lang_class is None and '_' in lang:
@@ -552,7 +559,11 @@ def _word_collector(self, doctree: nodes.document) -> WordStore:
         split = self.lang.split
         language = self.lang.lang
         _feed_visit_nodes(
-            doctree, word_store=word_store, split=split, language=language
+            doctree,
+            word_store=word_store,
+            split=split,
+            language=language,
+            normalization=self._unicode_normalization,
         )
         return word_store
 
@@ -602,7 +613,14 @@ def _feed_visit_nodes(
     word_store: WordStore,
     split: Callable[[str], list[str]],
     language: str,
+    normalization: Literal['NFC', 'NFKC', 'NFD', 'NFKD'] | None,
 ) -> None:
+    def normalize(text: str) -> str:
+        if normalization:
+            return unicodedata.normalize(normalization, text)
+        else:
+            return text
+
     if isinstance(node, nodes.comment):
         return
     elif isinstance(node, nodes.Element) and 'no-search' in node['classes']:
@@ -626,18 +644,26 @@ def _feed_visit_nodes(
                 flags=re.IGNORECASE | re.DOTALL,
             )
             nodetext = re.sub(r'<[^<]+?>', '', nodetext)
-            word_store.words.extend(split(nodetext))
+            word_store.words.extend(split(normalize(nodetext)))
         return
     elif isinstance(node, nodes.meta) and _is_meta_keywords(node, language):
-        keywords = [keyword.strip() for keyword in node['content'].split(',')]
+        keywords = [
+            normalize(keyword.strip()) for keyword in node['content'].split(',')
+        ]
         word_store.words.extend(keywords)
     elif isinstance(node, nodes.Text):
-        word_store.words.extend(split(node.astext()))
+        word_store.words.extend(split(normalize(node.astext())))
     elif isinstance(node, nodes.title):
         title, is_main_title = node.astext(), len(word_store.titles) == 0
        ids = node.parent['ids']
        title_node_id = None if is_main_title else ids[0] if ids else None
-        word_store.titles.append((title, title_node_id))
-        word_store.title_words.extend(split(title))
+        word_store.titles.append((normalize(title), title_node_id))
+        word_store.title_words.extend(split(normalize(title)))
     for child in node.children:
-        _feed_visit_nodes(child, word_store=word_store, split=split, language=language)
+        _feed_visit_nodes(
+            child,
+            word_store=word_store,
+            split=split,
+            language=language,
+            normalization=normalization,
+        )
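For reference, a standalone sketch (stdlib only, not part of the patch) of
what the normalization forms do to the kinds of strings
``_feed_visit_nodes`` now passes through ``normalize``::

    import unicodedata

    fullwidth = 'Ｐｙｔｈｏｎ'  # full-width compatibility characters
    print(unicodedata.normalize('NFKC', fullwidth))  # -> 'Python'

    # NFC/NFD only handle canonical equivalence (e.g. combining accents)
    # and leave compatibility characters such as full-width forms alone.
    print(unicodedata.normalize('NFC', fullwidth) == fullwidth)  # True

    decomposed = 'e\u0301'  # 'e' followed by a combining acute accent
    print(unicodedata.normalize('NFC', decomposed) == '\u00e9')  # True ('é')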
diff --git a/sphinx/themes/basic/static/documentation_options.js.jinja b/sphinx/themes/basic/static/documentation_options.js.jinja
index b66690abd19..0fb300eb755 100644
--- a/sphinx/themes/basic/static/documentation_options.js.jinja
+++ b/sphinx/themes/basic/static/documentation_options.js.jinja
@@ -10,4 +10,5 @@ const DOCUMENTATION_OPTIONS = {
     NAVIGATION_WITH_KEYS: {{ 'true' if theme_navigation_with_keys|tobool else 'false'}},
     SHOW_SEARCH_SUMMARY: {{ 'true' if show_search_summary else 'false' }},
     ENABLE_SEARCH_SHORTCUTS: {{ 'true' if theme_enable_search_shortcuts|tobool else 'false'}},
+    SEARCH_UNICODE_NORMALIZATION: {{ '"' + search_unicode_normalization + '"' if search_unicode_normalization else 'null' }},
 };
diff --git a/sphinx/themes/basic/static/searchtools.js b/sphinx/themes/basic/static/searchtools.js
index 91f4be57fc8..7e6da88762d 100644
--- a/sphinx/themes/basic/static/searchtools.js
+++ b/sphinx/themes/basic/static/searchtools.js
@@ -276,7 +276,15 @@ const Search = {
     else Search.deferQuery(query);
   },
 
+  _normalizeQuery: (query, form) => {
+    return query.normalize(form);
+  },
+
   _parseQuery: (query) => {
+    if (DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION) {
+      query = Search._normalizeQuery(query, DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION);
+    }
+
     // stem the search terms and add them to the correct list
     const stemmer = new Stemmer();
     const searchTerms = new Set();
diff --git a/tests/js/fixtures/normalization/searchindex.js b/tests/js/fixtures/normalization/searchindex.js
new file mode 100644
index 00000000000..dbd1b645e76
--- /dev/null
+++ b/tests/js/fixtures/normalization/searchindex.js
@@ -0,0 +1 @@
+Search.setIndex({"alltitles":{"Sphinx":[[0,null]]},"docnames":["index"],"envversion":{"sphinx":65,"sphinx.domains.c":3,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":9,"sphinx.domains.index":1,"sphinx.domains.javascript":3,"sphinx.domains.math":2,"sphinx.domains.python":4,"sphinx.domains.rst":2,"sphinx.domains.std":2},"filenames":["index.rst"],"indexentries":{},"objects":{},"objnames":{},"objtypes":{},"terms":{"i":0,"main":0,"normal":0,"page":0,"project":0,"test":0,"thi":0},"titles":["Sphinx"],"titleterms":{"sphinx":0}})
\ No newline at end of file
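The template change keeps the client in sync with the build: JavaScript's
``String.prototype.normalize`` implements the same UAX #15 forms as
Python's ``unicodedata.normalize``, so a query normalized in the browser
matches terms normalized at index time. A sketch of that symmetry (not
part of the patch)::

    import unicodedata

    form = 'NFKC'  # the value written into documentation_options.js
    indexed = unicodedata.normalize(form, 'Ｓｐｈｉｎｘ').lower()  # build time
    queried = unicodedata.normalize(form, 'Sphinx').lower()        # query time
    assert indexed == queried == 'sphinx'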
diff --git a/tests/js/roots/normalization/conf.py b/tests/js/roots/normalization/conf.py
new file mode 100644
index 00000000000..f7e7b2e1919
--- /dev/null
+++ b/tests/js/roots/normalization/conf.py
@@ -0,0 +1 @@
+html_search_unicode_normalization = 'NFKC'
diff --git a/tests/js/roots/normalization/index.rst b/tests/js/roots/normalization/index.rst
new file mode 100644
index 00000000000..d9534af0eee
--- /dev/null
+++ b/tests/js/roots/normalization/index.rst
@@ -0,0 +1,5 @@
+Sphinx
+======
+
+This is the main page of the ``normalization`` test project.
+
diff --git a/tests/js/searchtools.spec.js b/tests/js/searchtools.spec.js
index 809fd19d0f4..ee5d982ffbc 100644
--- a/tests/js/searchtools.spec.js
+++ b/tests/js/searchtools.spec.js
@@ -95,7 +95,44 @@ describe('Basic html theme search', function() {
     ]];
     expect(Search.performTermsSearch(searchterms, excluded, terms, titleterms)).toEqual(hits);
   });
+  });
+
+  describe('unicode normalization', function() {
+    it('should find documents indexed with half-width characters using a full-width query', function() {
+      DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION = 'NFKC';
+
+      eval(loadFixture("normalization/searchindex.js"));
+
+      [_searchQuery, searchterms, excluded, ..._remainingItems] = Search._parseQuery('Ｓｐｈｉｎｘ');
+
+      terms = Search._index.terms;
+      titleterms = Search._index.titleterms;
+
+      hits = [[
+        "index",
+        "Sphinx",
+        "",
+        null,
+        15,
+        "index.rst",
+        "text"],
+      ];
+
+      expect(Search.performTermsSearch(searchterms, excluded, terms, titleterms)).toEqual(hits);
+    });
+    it('should parse queries with half-width and full-width characters equivalently', function() {
+      const halfWidthQuery = Search._normalizeQuery('Sphinx', 'NFKC');
+      const fullWidthQuery = Search._normalizeQuery('Ｓｐｈｉｎｘ', 'NFKC');
+
+      expect(halfWidthQuery).toEqual(fullWidthQuery);
+    });
+
+    afterEach(() => {
+      Object.keys(DOCUMENTATION_OPTIONS).forEach(key => {
+        delete DOCUMENTATION_OPTIONS[key];
+      });
+    });
   });
 
   describe('aggregation of search results', function() {
diff --git a/tests/roots/test-search/tocitem.rst b/tests/roots/test-search/tocitem.rst
index 98a1dc7ed9f..69a723077d7 100644
--- a/tests/roots/test-search/tocitem.rst
+++ b/tests/roots/test-search/tocitem.rst
@@ -15,3 +15,5 @@ lorem ipsum
 
 模块中 CAS service部分
 可以Chinesetesttwo查看
+
+Ｐｙｔｈｏｎ
diff --git a/tests/test_search.py b/tests/test_search.py
index 22fa6ab7616..b30d79f03bc 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -484,3 +484,16 @@ def test_check_js_search_indexes(make_app, sphinx_test_tempdir, directory):
         f'Search index fixture {existing_searchindex} does not match regenerated copy.'
     )
     assert fresh_searchindex.read_bytes() == existing_searchindex.read_bytes(), msg
+
+
+@pytest.mark.sphinx(
+    'html',
+    testroot='search',
+    confoverrides={'html_search_unicode_normalization': 'NFKC'},
+    srcdir='search_normalize',
+)
+def test_search_index_unicode_normalize(app: SphinxTestApp) -> None:
+    app.build(force_all=True)
+    index = load_searchindex(app.outdir / 'searchindex.js')
+    assert 'ｐｙｔｈｏｎ' not in index['terms']
+    assert 'python' in index['terms']
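A quick stdlib check (not part of the patch) of why the new test is
meaningful: without normalization the indexer would store the lowercased
full-width word, so only an NFKC build makes ``'python'`` appear in the
index terms::

    import unicodedata

    raw = 'Ｐｙｔｈｏｎ'.lower()  # 'ｐｙｔｈｏｎ' -- still full-width
    norm = unicodedata.normalize('NFKC', 'Ｐｙｔｈｏｎ').lower()  # 'python'
    assert raw != norm
    assert norm == 'python'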