Skip to content

Commit

Permalink
Removing dictionary file support (in support of #278)
Browse files Browse the repository at this point in the history
Note that I'm smuggling this in with the config cin support changes, since this requires changes to the config. However, ODD and documentation changes still need to be done, so the issue can't be resolved without further work.
  • Loading branch information
joeytakeda committed Nov 17, 2023
1 parent 4abbab7 commit 11efc76
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 26 deletions.
33 changes: 23 additions & 10 deletions xsl/create_config_xsl.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -348,10 +348,8 @@

<!--First, create the global variables and parameters-->
<xsl:call-template name="createGlobals" exclude-result-prefixes="#all"/>

<!--Now create the dictionary XML files-->
<xsl:call-template name="createDictionaryXML" exclude-result-prefixes="#all"/>

<xsl:call-template name="createStopwordsXML" exclude-result-prefixes="#all"/>

<xsl:for-each select="$rules">
<xso:template match="{@match}" priority="{$PRIORITY_THIRD}" mode="decorate">
Expand Down Expand Up @@ -583,9 +581,6 @@
</xso:param>
</xsl:for-each>




<!-- We record the current default stemmer folder. -->
<xso:param name="defaultStemmerFolder"><xsl:value-of select="$ssDefaultStemmerFolder"/></xso:param>

Expand Down Expand Up @@ -686,9 +681,27 @@
</xso:template>
</xsl:template>




<xd:doc>
  <xd:desc>Template to create an XML representation of each stopwords file
    referenced from the configuration, plus an associated key. For every
    stopwordsFile element found in $configDoc, the referenced text file is
    split on whitespace and written to the dicts folder under $outDir as a
    list of lower-cased hcmc:word elements. A variable named after the config
    element (its local name followed by 'Xml') that loads the generated
    document is emitted into the output stylesheet, followed by a key 'w'
    over hcmc:word.</xd:desc>
</xd:doc>
<xsl:template name="createStopwordsXML">
  <xsl:for-each select="($configDoc//stopwordsFile)">
    <!--Normalize the element content so stray whitespace around the path
        (or content split across several text nodes) does not break resolve-uri-->
    <xsl:variable name="path" select="resolve-uri(normalize-space(.),$configUri)"/>
    <xsl:variable name="fileName" select="tokenize($path,'/')[last()]"/>
    <!--Keep the established naming for anything containing '.txt'; otherwise
        strip whatever final extension there is, so a file without '.txt' no
        longer yields an empty base name (and thus 'dicts/.xml')-->
    <xsl:variable name="baseName"
      select="if (contains($fileName,'.txt'))
              then substring-before($fileName,'.txt')
              else replace($fileName,'\.[^.]*$','')"/>
    <xsl:variable name="uri" select="concat($outDir,'/dicts/',$baseName,'.xml')"/>
    <xsl:result-document href="{$uri}" method="xml">
      <hcmc:words>
        <!--Drop the empty tokens that tokenize() produces for leading or
            trailing whitespace (e.g. a final newline in the file)-->
        <xsl:for-each select="tokenize(unparsed-text($path),'\s+')[normalize-space(.) ne '']">
          <hcmc:word><xsl:value-of select="lower-case(normalize-space(.))"/></hcmc:word>
        </xsl:for-each>
      </hcmc:words>
    </xsl:result-document>
    <!--Build the XPath expression doc('...') for the generated stylesheet,
        doubling any apostrophe in the URI so the string literal stays valid-->
    <xsl:variable name="docFn"
      select="concat('doc(''', replace($uri, '''', ''''''), ''')')"/>
    <xso:variable name="{concat(local-name(),'Xml')}" select="{$docFn}"/>
  </xsl:for-each>
  <xso:key name="w" match="hcmc:word" use="."/>
</xsl:template>
<!--
<xd:doc>
<xd:desc>Template to create an XML representation of the dictionary file
and an associated key.</xd:desc>
Expand All @@ -709,7 +722,7 @@
</xsl:for-each>
<xso:key name="w" match="hcmc:word" use="."/>
</xsl:template>
</xsl:template>-->



Expand Down
32 changes: 16 additions & 16 deletions xsl/create_reports.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@
<xsl:call-template name="createFilters"/>
<xsl:call-template name="createExcludes"/>
<xsl:if test="$verboseReport = 'true'">
<xsl:call-template name="createNonDictionaryList"/>
<!-- <xsl:call-template name="createNonDictionaryList"/>-->
<xsl:call-template name="createForeignWordList"/>
</xsl:if>
</div>
Expand Down Expand Up @@ -378,7 +378,7 @@
</xsl:if>
</xsl:template>

<xd:doc>
<!-- <xd:doc>
<xd:desc>Template for creating the "Not in Dictionary" list. While a term's exclusion
from the dictionary doesn't change the search results, this report is helpful for catching
typos in your document collection. </xd:desc>
Expand All @@ -388,29 +388,29 @@
<section>
<h2>Words Not In Dictionary</h2>
<!--Only check stems that are words-->
<!-\-Only check stems that are words-\->
<xsl:variable name="stemsToCheck" select="$spans[not(matches(@ss-stem,'\d'))][not(hcmc:isForeign(.))]" as="element(span)*"/>
<!--Retrieve the outermost spans so we don't include the nested spans from hyphenated terms
(we process those a bit differently) -->
<!-\-Retrieve the outermost spans so we don't include the nested spans from hyphenated terms
(we process those a bit differently) -\->
<xsl:variable name="outermostStems" select="outermost($stemsToCheck)" as="element(span)*"/>
<xsl:variable name="wordsNotInDictionaryMap" as="map(xs:string, element(span)*)">
<xsl:map>
<!--Group by whether or not it has descendant spans-->
<!-\-Group by whether or not it has descendant spans-\->
<xsl:for-each-group select="$outermostStems" group-by="exists(child::span[@ss-stem])">
<xsl:choose>
<!--If this thing has child stems, it's a hyphenated construct
and so we check each child term individually-->
<!-\-If this thing has child stems, it's a hyphenated construct
and so we check each child term individually-\->
<xsl:when test="current-grouping-key()">
<!--Now iterate through all of the hyphenated spans-->
<!-\-Now iterate through all of the hyphenated spans-\->
<xsl:for-each-group select="current-group()" group-by="string(.)">
<!--Stash the word-->
<!-\-Stash the word-\->
<xsl:variable name="term" select="current-grouping-key()"/>
<!--Stash the current context-->
<!-\-Stash the current context-\->
<xsl:variable name="hyphenatedSpan" select="current-group()[1]" as="element(span)"/>
<!--Not in dictionary spans-->
<!-\-Not in dictionary spans-\->
<xsl:variable name="words"
select="for $s in $hyphenatedSpan/span[@ss-stem] return lower-case(string($s))"
as="xs:string*"/>
Expand All @@ -428,7 +428,7 @@
</xsl:for-each-group>
</xsl:when>
<xsl:otherwise>
<!--Group by string value (so basically just distinct values)-->
<!-\-Group by string value (so basically just distinct values)-\->
<xsl:for-each-group select="current-group()" group-by="hcmc:cleanWordForStemming(lower-case(string(.)))">
<xsl:variable name="word" select="current-grouping-key()" as="xs:string"/>
<xsl:if test="not(hcmc:isInDictionary($word))">
Expand Down Expand Up @@ -483,9 +483,9 @@
</xsl:choose>
</details>
</section>
</xsl:template>
</xsl:template>-->

<xd:doc>
<!-- <xd:doc>
<xd:desc><xd:ref name="hcmc:isInDictionary">hcmc:isInDictionary</xd:ref> checks
whether or not a word is in the provided dictionary. This is basically just a wrapper
around the key() function, but we take advantage of Saxon 10HE's memo-function capabilities
Expand All @@ -495,7 +495,7 @@
<xsl:function name="hcmc:isInDictionary" new-each-time="no" as="xs:boolean">
<xsl:param name="word" as="xs:string"/>
<xsl:sequence select="exists(key('w', $word, $dictionaryFileXml))"/>
</xsl:function>
</xsl:function>-->

<xd:doc>
<xd:desc>Template to create a report of all "foreign" words in the collection:
Expand Down

0 comments on commit 11efc76

Please sign in to comment.