From 38f37a3f8091f289a35a9e82ed2d79b8a55ccc0b Mon Sep 17 00:00:00 2001 From: Paul Cadman Date: Tue, 25 Mar 2014 16:14:18 +0000 Subject: [PATCH 1/5] Try to index a wikipage each time it is fetched so that search macros are updated after new pages are added. https://bugs.corefiling.com/show_bug.cgi?id=6940 --- .../impl/SearchIndexPopulatingPageStore.java | 21 ++++++++++++++++++- .../reviki/webtests/TestSearchMacro.java | 17 +++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/net/hillsdon/reviki/search/impl/SearchIndexPopulatingPageStore.java b/src/net/hillsdon/reviki/search/impl/SearchIndexPopulatingPageStore.java index 4a747b9b..65a09513 100644 --- a/src/net/hillsdon/reviki/search/impl/SearchIndexPopulatingPageStore.java +++ b/src/net/hillsdon/reviki/search/impl/SearchIndexPopulatingPageStore.java @@ -20,8 +20,10 @@ import net.hillsdon.reviki.search.SearchEngine; import net.hillsdon.reviki.vc.InterveningCommitException; import net.hillsdon.reviki.vc.PageInfo; +import net.hillsdon.reviki.vc.PageReference; import net.hillsdon.reviki.vc.PageStore; import net.hillsdon.reviki.vc.PageStoreException; +import net.hillsdon.reviki.vc.VersionedPageInfo; import net.hillsdon.reviki.vc.impl.SimpleDelegatingPageStore; import org.apache.commons.logging.Log; @@ -62,5 +64,22 @@ public long set(final PageInfo page, final String lockToken, final long baseRevi } return newRevision; } - + + @Override + public VersionedPageInfo get(PageReference ref, long revision) throws PageStoreException { + VersionedPageInfo page = super.get(ref, revision); + if (!page.isNewPage()) { + try { + if(!_indexer.isIndexBeingBuilt()) { + if(page.getContent().trim().length() > 0) { + _indexer.index(page, false); + } + } + } + catch (IOException e) { + LOG.error("Error adding to search index, skipping page: " + page, e); + } + } + return page; + } } diff --git a/webtests/net/hillsdon/reviki/webtests/TestSearchMacro.java b/webtests/net/hillsdon/reviki/webtests/TestSearchMacro.java index 36e55110..09d1ce32 100644 --- a/webtests/net/hillsdon/reviki/webtests/TestSearchMacro.java +++ b/webtests/net/hillsdon/reviki/webtests/TestSearchMacro.java @@ -90,4 +90,21 @@ public void testAttributeKeyWithColon() throws Exception { assertTrue(searchingOnPageAsText.contains(searchingFor)); assertSearchFindsPageUsingQuery(searchingOnPage, searchingFor, "path:" + searchingFor); } + + public void testBackLinksOnReferencedPage() throws Exception { + String refs = uniqueWikiPageName("Refs"); + String findMe = uniqueWikiPageName("FindMe"); + editWikiPage(refs, String.format("Macro: <>", findMe), "", "Search Macro Test", true); + + editWikiPage(findMe, "I'm here", "", "Search Macro Test", true); + + HtmlPage refsPage = getWikiPage(refs); + String refsPageAsText = refsPage.asText(); + assertTrue(refsPageAsText.contains(findMe)); + + //editWikiPage(refs, String.format("Macro: <>", findMe), "", "prompt", false); + HtmlPage findMePage = getWikiPage(findMe); + String findMePageAsText = findMePage.asText(); + assertTrue(findMePageAsText.contains(refs)); + } } From 9533318dd0b7e943c8a6a8344bfc00ae22a0e0de Mon Sep 17 00:00:00 2001 From: Paul Cadman Date: Tue, 25 Mar 2014 16:24:56 +0000 Subject: [PATCH 2/5] isIndexBeingBuilt() is already checked in the call to index, and there is no delete action in the case where the page content is empty, so we can simplify this get method. https://bugs.corefiling.com/show_bug.cgi?id=6940 --- .../reviki/search/impl/SearchIndexPopulatingPageStore.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/net/hillsdon/reviki/search/impl/SearchIndexPopulatingPageStore.java b/src/net/hillsdon/reviki/search/impl/SearchIndexPopulatingPageStore.java index 65a09513..fc0f9717 100644 --- a/src/net/hillsdon/reviki/search/impl/SearchIndexPopulatingPageStore.java +++ b/src/net/hillsdon/reviki/search/impl/SearchIndexPopulatingPageStore.java @@ -70,11 +70,7 @@ public VersionedPageInfo get(PageReference ref, long revision) throws PageStoreE VersionedPageInfo page = super.get(ref, revision); if (!page.isNewPage()) { try { - if(!_indexer.isIndexBeingBuilt()) { - if(page.getContent().trim().length() > 0) { - _indexer.index(page, false); - } - } + _indexer.index(page, false); } catch (IOException e) { LOG.error("Error adding to search index, skipping page: " + page, e); From 16f08f106a7bb6883c8893c2992646d2867406c1 Mon Sep 17 00:00:00 2001 From: Paul Cadman Date: Tue, 25 Mar 2014 17:15:04 +0000 Subject: [PATCH 3/5] Remove test debug line https://bugs.corefiling.com/show_bug.cgi?id=6940 --- webtests/net/hillsdon/reviki/webtests/TestSearchMacro.java | 1 - 1 file changed, 1 deletion(-) diff --git a/webtests/net/hillsdon/reviki/webtests/TestSearchMacro.java b/webtests/net/hillsdon/reviki/webtests/TestSearchMacro.java index 09d1ce32..8117072c 100644 --- a/webtests/net/hillsdon/reviki/webtests/TestSearchMacro.java +++ b/webtests/net/hillsdon/reviki/webtests/TestSearchMacro.java @@ -102,7 +102,6 @@ public void testBackLinksOnReferencedPage() throws Exception { String refsPageAsText = refsPage.asText(); assertTrue(refsPageAsText.contains(findMe)); - //editWikiPage(refs, String.format("Macro: <>", findMe), "", "prompt", false); HtmlPage findMePage = getWikiPage(findMe); String findMePageAsText = findMePage.asText(); assertTrue(findMePageAsText.contains(refs)); From 2ea8be8eb83ee9bc45ab2c905ef1c6b280e657b8 Mon Sep 17 00:00:00 2001 From: Paul Cadman Date: Tue, 25 Mar 2014 18:02:49 +0000 Subject: [PATCH 4/5] Only update the search index if either the rendered wikipage or its attributes have changed. We store two maps on the search index: 1. page uid -> rendered content hash 2. page uid -> sorted list of page attrs hash We check the computed hashes of the page against the stored ones before updating the index. https://bugs.corefiling.com/show_bug.cgi?id=6940 --- .../reviki/search/impl/LuceneSearcher.java | 43 +++++++++++++++++-- .../hillsdon/reviki/wiki/RenderedPage.java | 4 ++ .../reviki/webtests/TestSearchMacro.java | 12 ++++++ 3 files changed, 55 insertions(+), 4 deletions(-) diff --git a/src/net/hillsdon/reviki/search/impl/LuceneSearcher.java b/src/net/hillsdon/reviki/search/impl/LuceneSearcher.java index d7fc48f2..c4d35111 100644 --- a/src/net/hillsdon/reviki/search/impl/LuceneSearcher.java +++ b/src/net/hillsdon/reviki/search/impl/LuceneSearcher.java @@ -53,7 +53,6 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; - import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.queryParser.QueryParser.Operator; @@ -109,6 +108,8 @@ public NoQueryPerformedException(final QuerySyntaxException cause) { private static final String[] ALL_SEARCH_FIELDS = new String[] {FIELD_PATH, FIELD_PATH_LOWER, FIELD_TITLE_TOKENIZED, FIELD_CONTENT, FIELD_ATTRIBUTES}; + private final Map _contentHashCodes = new LinkedHashMap(); + private final Map _attrHashCodes = new LinkedHashMap(); private final String _wikiName; private final File _dir; private final List _otherDirs; @@ -162,8 +163,7 @@ public TokenStream tokenStream(final String fieldName, final Reader reader) { return perField; } - private Document createWikiPageDocument(final PageInfo page) throws IOException, PageStoreException { - RenderedPage renderedPage = _renderedPageFactory.create(page, URLOutputFilter.NULL); + private Document createWikiPageDocument(final PageInfo page, final RenderedPage renderedPage) throws IOException, PageStoreException { final String path = page.getPath(); final String wiki = page.getWiki(); final String content = page.getContent(); @@ -226,6 +226,36 @@ private void replaceDocument(final String keyField, final Document document) thr writer.close(); } } + + private boolean isIndexUpToDate(final PageInfo page, final RenderedPage renderedPage) { + Integer hashForContent = getPageContentHashCode(page); + Integer hashForAttrs = getPageAttrHashCode(page); + boolean isContentUpToDate = hashForContent != null && hashForContent.equals(renderedPage.getPageHashCode()); + boolean isAttrsUpToDate = hashForAttrs != null && hashForAttrs.equals(computePageAttrHashCode(page)); + return isContentUpToDate && isAttrsUpToDate; + } + + private Integer getPageContentHashCode(final PageInfo page) { + return _contentHashCodes.get(uidFor(page.getWiki(), page.getPath())); + } + + private void setPageContentHashCode(final PageInfo page, final Integer hashCode) { + _contentHashCodes.put(uidFor(page.getWiki(), page.getPath()), hashCode); + } + + private Integer computePageAttrHashCode(final PageInfo page) { + List attrs = attributesToStringList(page.getAttributes()); + Collections.sort(attrs); + return attrs.hashCode(); + } + + private Integer getPageAttrHashCode(final PageInfo page) { + return _attrHashCodes.get(uidFor(page.getWiki(), page.getPath())); + } + + private void setPageAttrHashCode(final PageInfo page, final Integer hashCode) { + _attrHashCodes.put(uidFor(page.getWiki(), page.getPath()), hashCode); + } // Lucene allows multiple non-deleting readers and at most one writer at a time. // It maintains a lock file but we never want it to fail to take the lock, so serialize writes. @@ -235,7 +265,12 @@ public synchronized void index(final PageInfo page, final boolean buildingIndex) } if (!isIndexBeingBuilt() || buildingIndex) { createIndexIfNecessary(); - replaceWikiDocument(createWikiPageDocument(page)); + RenderedPage renderedPage = _renderedPageFactory.create(page, URLOutputFilter.NULL); + if (!isIndexUpToDate(page, renderedPage)) { + replaceWikiDocument(createWikiPageDocument(page, renderedPage)); + setPageContentHashCode(page, renderedPage.getPageHashCode()); + setPageAttrHashCode(page, computePageAttrHashCode(page)); + } } } diff --git a/src/net/hillsdon/reviki/wiki/RenderedPage.java b/src/net/hillsdon/reviki/wiki/RenderedPage.java index 0bfbf978..d7448451 100644 --- a/src/net/hillsdon/reviki/wiki/RenderedPage.java +++ b/src/net/hillsdon/reviki/wiki/RenderedPage.java @@ -46,6 +46,10 @@ public String getPage() { return _pageName; } + public Integer getPageHashCode() { + return _rendered.hashCode(); + } + /** * @return outgoing links in document order. * @throws IOException If we fail to parse. diff --git a/webtests/net/hillsdon/reviki/webtests/TestSearchMacro.java b/webtests/net/hillsdon/reviki/webtests/TestSearchMacro.java index 8117072c..6b80564d 100644 --- a/webtests/net/hillsdon/reviki/webtests/TestSearchMacro.java +++ b/webtests/net/hillsdon/reviki/webtests/TestSearchMacro.java @@ -106,4 +106,16 @@ public void testBackLinksOnReferencedPage() throws Exception { String findMePageAsText = findMePage.asText(); assertTrue(findMePageAsText.contains(refs)); } + + public void testMatchingAttributesNoChangeToRenderedPage() throws Exception { + String searchingFor = uniqueWikiPageName("SearchMacroSearchingFor"); + editWikiPage(searchingFor, "Should not be found by macro", "", "Search Macro Test", true); + editWikiPage(searchingFor, "Should not be found by macro", "status = completed", "Search Macro Test", false); + String searchingOn = uniqueWikiPageName("SearchMacroTest"); + HtmlPage searchingOnPage = editWikiPage(searchingOn, "Search Macro Results: <>", "", "Search Macro Test", true); + String searchingOnPageAsText = searchingOnPage.asText(); + assertTrue(searchingOnPageAsText.contains("Search Macro Results:")); + assertTrue(searchingOnPageAsText.contains(searchingFor)); + assertSearchFindsPageUsingQuery(searchingOnPage, searchingFor, "path:" + searchingFor); + } } From 4d5e9980324c510f8423733778ececd78950f07e Mon Sep 17 00:00:00 2001 From: Paul Cadman Date: Wed, 26 Mar 2014 10:27:23 +0000 Subject: [PATCH 5/5] Add isIndexUpToDate to SearchEngine to provide a method of checking if the SearchEngine index contains information on the latest rendered copy of a page. This method is now independent from the index method, you only need to check isIndexUpToDate if you're considering indexing a page upon page GET, not page POST or otherwise. https://bugs.corefiling.com/show_bug.cgi?id=6940 --- .../hillsdon/reviki/search/SearchEngine.java | 9 ++++++- .../impl/ExternalCommitAwareSearchEngine.java | 4 ++++ .../reviki/search/impl/LuceneSearcher.java | 24 +++++++++---------- .../impl/SearchIndexPopulatingPageStore.java | 4 +++- 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/src/net/hillsdon/reviki/search/SearchEngine.java b/src/net/hillsdon/reviki/search/SearchEngine.java index 9f3529cc..b5f0b546 100644 --- a/src/net/hillsdon/reviki/search/SearchEngine.java +++ b/src/net/hillsdon/reviki/search/SearchEngine.java @@ -104,5 +104,12 @@ public interface SearchEngine { * @return A quoted version that escapes any characters that have special significance in the search syntax. */ String escape(String in); - + + /** + * + * @param page Page to check + * @return Boolean indicating whether the page contents are indexed + * @throws IOException On error reading from the search index + */ + boolean isIndexUpToDate(PageInfo page) throws IOException, PageStoreException; } diff --git a/src/net/hillsdon/reviki/search/impl/ExternalCommitAwareSearchEngine.java b/src/net/hillsdon/reviki/search/impl/ExternalCommitAwareSearchEngine.java index a89f0e29..b27fdb20 100644 --- a/src/net/hillsdon/reviki/search/impl/ExternalCommitAwareSearchEngine.java +++ b/src/net/hillsdon/reviki/search/impl/ExternalCommitAwareSearchEngine.java @@ -140,4 +140,8 @@ public Set outgoingLinks(final String page) throws IOException, PageStor return _delegate.outgoingLinks(page); } + public boolean isIndexUpToDate(PageInfo page) throws IOException, PageStoreException { + return _delegate.isIndexUpToDate(page); + } + } diff --git a/src/net/hillsdon/reviki/search/impl/LuceneSearcher.java b/src/net/hillsdon/reviki/search/impl/LuceneSearcher.java index c4d35111..7fce1952 100644 --- a/src/net/hillsdon/reviki/search/impl/LuceneSearcher.java +++ b/src/net/hillsdon/reviki/search/impl/LuceneSearcher.java @@ -227,14 +227,6 @@ private void replaceDocument(final String keyField, final Document document) thr } } - private boolean isIndexUpToDate(final PageInfo page, final RenderedPage renderedPage) { - Integer hashForContent = getPageContentHashCode(page); - Integer hashForAttrs = getPageAttrHashCode(page); - boolean isContentUpToDate = hashForContent != null && hashForContent.equals(renderedPage.getPageHashCode()); - boolean isAttrsUpToDate = hashForAttrs != null && hashForAttrs.equals(computePageAttrHashCode(page)); - return isContentUpToDate && isAttrsUpToDate; - } - private Integer getPageContentHashCode(final PageInfo page) { return _contentHashCodes.get(uidFor(page.getWiki(), page.getPath())); } @@ -266,11 +258,9 @@ public synchronized void index(final PageInfo page, final boolean buildingIndex) if (!isIndexBeingBuilt() || buildingIndex) { createIndexIfNecessary(); RenderedPage renderedPage = _renderedPageFactory.create(page, URLOutputFilter.NULL); - if (!isIndexUpToDate(page, renderedPage)) { - replaceWikiDocument(createWikiPageDocument(page, renderedPage)); - setPageContentHashCode(page, renderedPage.getPageHashCode()); - setPageAttrHashCode(page, computePageAttrHashCode(page)); - } + replaceWikiDocument(createWikiPageDocument(page, renderedPage)); + setPageContentHashCode(page, renderedPage.getPageHashCode()); + setPageAttrHashCode(page, computePageAttrHashCode(page)); } } @@ -555,4 +545,12 @@ public void setIndexBeingBuilt(boolean buildingIndex) throws IOException { public String escape(final String in) { return QueryParser.escape(in); } + + public boolean isIndexUpToDate(final PageInfo page) throws IOException, PageStoreException { + RenderedPage renderedPage = _renderedPageFactory.create(page, URLOutputFilter.NULL); + Integer hashForContent = getPageContentHashCode(page); + Integer hashForAttrs = getPageAttrHashCode(page); + + return hashForContent != null && hashForAttrs != null && hashForContent.equals(renderedPage.getPageHashCode()) && hashForAttrs.equals(computePageAttrHashCode(page)); + } } diff --git a/src/net/hillsdon/reviki/search/impl/SearchIndexPopulatingPageStore.java b/src/net/hillsdon/reviki/search/impl/SearchIndexPopulatingPageStore.java index fc0f9717..86f77253 100644 --- a/src/net/hillsdon/reviki/search/impl/SearchIndexPopulatingPageStore.java +++ b/src/net/hillsdon/reviki/search/impl/SearchIndexPopulatingPageStore.java @@ -70,7 +70,9 @@ public VersionedPageInfo get(PageReference ref, long revision) throws PageStoreE VersionedPageInfo page = super.get(ref, revision); if (!page.isNewPage()) { try { - _indexer.index(page, false); + if (!_indexer.isIndexUpToDate(page)) { + _indexer.index(page, false); + } } catch (IOException e) { LOG.error("Error adding to search index, skipping page: " + page, e);