I'm working on scraping HTML pages and transforming the selected content from them into XML and I've been seeing a few errors here and there thrown by the Saxon XSLT parser:
WARN [2011-01-07 14:44:35 EST] (SaxonTranslator#429473) improper java log: XPTY0004: A sequence of more than one item is not allowed as the first argument of
WARN [2011-01-07 14:44:35 EST] (SaxonTranslator#429473) improper java log: matches() ("<!--//<![CDATA[ var m...", "<!--//<![CDATA[ va...", ...)
ERROR [2011-01-07 14:44:35 EST] (SaxonTranslator#429473) error while processing file 'ffd827cfd0f59a793ba99a309b392e40.substituted': Saxon error...A sequence of more than one item is not allowed as the first argument of matches() ("<!-- google_ad_client = "ca-...", "", ...) (~/sandbox/core_plugins/JavaPlugin.rb:131:in `execute')
The same with type of error with ("<!--//<![CDATA[ var m...", "<!--//<![CDATA[ va...", ...)
Also one with ("...", "", ...)
Here is the bit of the file where I believe the problem is occurring:
<?xml version="1.0" encoding="ISO-8859-1"?>
<xsl:stylesheet
xmlns:exsl="http://exslt.org/common"
xmlns:str="http://exslt.org/strings"
xmlns:date="http://exslt.org/dates-and-times"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:html="http://www.w3.org/1999/xhtml"
extension-element-prefixes="exsl str date"
version="2.0">
<xsl:import href="utils.xslt"/>
<xsl:output method="xml" indent="yes"/>
<xsl:param name="feed" select="rand"/>
<xsl:template match="/">
<!-- Regular expression to find dates in the file -->
<xsl:variable name="date_regex"
select="'((Jaunary|February|March|April|May|June|July|August|September|October|November|December) \d?\d, \d\d\d\d)'"/>
<p:Report>
<p:PublicationMetadata>
<p:AdministrativeMetadata>
<!-- DATE -->
<p:DateList>
<xsl:choose>
<xsl:when test="matches(//body, $date_regex)">
<!-- Sometimes the text of an article contains a date. To prevent the extra -->
<!-- date from being included in the output, we save the regex results to a -->
<!-- sequence-valued variable, then use only the first entry. -->
<xsl:variable name="regex_result" as="xs:string*">
<xsl:analyze-string regex="{$date_regex}" select="//body">
<xsl:matching-substring>
<xsl:sequence select="."/>
</xsl:matching-substring>
</xsl:analyze-string>
</xsl:variable>
<p:DatePublished>
<xsl:call-template name="FormatDate">
<xsl:with-param name="docDate" select="$regex_result[1]"/>
</xsl:call-template>
</p:DatePublished>
<p:DatePosted>
<xsl:call-template name="FormatDate">
<xsl:with-param name="docDate" select="$regex_result[1]"/>
</xsl:call-template>
</p:DatePosted>
</xsl:when>
<xsl:otherwise>
<!-- Use current date if the document doesn't have a date field -->
<!-- Returns format yyyy-mm-dd -->
<xsl:variable name="curdate">
<xsl:value-of select="date:date()"/>
</xsl:variable>
<p:DatePublished>
<xsl:value-of select="$curdate"/>
<xsl:text>T12:00:00Z</xsl:text>
</p:DatePublished>
<p:DatePosted>
<xsl:value-of select="$curdate"/>
<xsl:text>T12:00:00Z</xsl:text>
</p:DatePosted>
</xsl:otherwise>
</xsl:choose>
</p:DateList>
<!-- ...Rest of the file... -->
</xsl:stylesheet>