I'm scraping HTML pages and transforming selected content from them into XML, and I've been seeing occasional errors thrown by the Saxon XSLT processor:
WARN [2011-01-07 14:44:35 EST] (SaxonTranslator#429473) improper java log: XPTY0004: A sequence of more than one item is not allowed as the first argument of
WARN [2011-01-07 14:44:35 EST] (SaxonTranslator#429473) improper java log: matches() ("<!--//<![CDATA[ var m...", "<!--//<![CDATA[ va...", ...)
ERROR [2011-01-07 14:44:35 EST] (SaxonTranslator#429473) error while processing file 'ffd827cfd0f59a793ba99a309b392e40.substituted': Saxon error...A sequence of more than one item is not allowed as the first argument of matches() ("<!-- google_ad_client = "ca-...", "", ...) (~/sandbox/core_plugins/JavaPlugin.rb:131:in `execute')
The same type of error also occurs with ("<!--//<![CDATA[ var m...", "<!--//<![CDATA[ va...", ...)
There is also one with ("...", "", ...)
Here is the bit of the file where I believe the problem is occurring:
<?xml version="1.0" encoding="ISO-8859-1"?>
extension-element-prefixes="exsl str date"
<xsl:import href="utils.xslt"/>
<xsl:output method="xml" indent="yes"/>
<xsl:param name="feed" select="rand"/>
<xsl:template match="/">
<!-- Regular expression to find dates in the file -->
<!-- Matches long-form English dates such as "January 7, 2011": a full month
     name, a space, a one- or two-digit day, a comma and a space, then a
     four-digit year. The outer group captures the whole date; the inner
     group captures the month name. The pattern is case-sensitive as written. -->
<xsl:variable name="date_regex"
              select="'((January|February|March|April|May|June|July|August|September|October|November|December) \d?\d, \d\d\d\d)'"/>
<!-- DATE -->
<xsl:when test="matches(//body, $date_regex)">
<!-- Sometimes the text of an article contains a date. To prevent the extra -->
<!-- date from being included in the output, we save the regex results to a -->
<!-- sequence-valued variable, then use only the first entry. -->
<xsl:variable name="regex_result" as="xs:string*">
<xsl:analyze-string regex="{$date_regex}" select="//body">
<xsl:sequence select="."/>
<xsl:call-template name="FormatDate">
<xsl:with-param name="docDate" select="$regex_result[1]"/>
<xsl:call-template name="FormatDate">
<xsl:with-param name="docDate" select="$regex_result[1]"/>
<!-- Use current date if the document doesn't have a date field -->
<!-- Returns format yyyy-mm-dd -->
<xsl:variable name="curdate">
<xsl:value-of select="date:date()"/>
<xsl:value-of select="$curdate"/>
<xsl:value-of select="$curdate"/>
<!-- ...Rest of the file... -->