|
Posted by Toby A Inkster on 07/20/07 07:56
Rik wrote:
> Regex could be the way to go.
Argh! No! That way lies nightmares. Get the XML_HTMLSax3 class from PEAR
and use that.
Here's an example that should parse TR, TD and TH tags (ignoring others)
including ROWSPAN and COLSPAN attributes. It creates an array of arrays
representing rows of cells. It uses 0-based indices.
<?php
/**
 * SAX-style handler that collects the text of HTML table cells into a
 * two-dimensional array indexed by [row][column] (both 0-based).
 *
 * ROWSPAN and COLSPAN are honoured: every grid slot covered by a spanning
 * cell is reserved, and the cell's text is stored at its top-left slot only.
 * Later cells skip over reserved slots, so their column index reflects their
 * visual position in the table.
 *
 * Only TR, TD and TH tags affect the grid; every other tag is ignored, which
 * means markup nested inside a cell (e.g. <b>) contributes its text to that
 * cell without disturbing the cell co-ordinates.
 *
 * NOTE(review): nested tables are not handled - an inner table's rows and
 * cells are folded into the same grid as the outer table.
 */
class TableParser
{
    /** Index of the row currently being parsed; -1 until the first TR. */
    private $currow = -1;

    /** Index of the cell currently being parsed; -1 when the row has no cell yet. */
    private $curcol = -1;

    /** Occupancy map: $shape[row][col] is TRUE once a cell covers that slot. */
    private $shape = array();

    /** Collected cell text: $data[row][col] => string. */
    private $data = array();

    /** TRUE while between an opening TD/TH and the end of that cell. */
    private $incell = false;

    /**
     * Handle an opening tag.
     *
     * @param mixed  $parser the parser invoking the callback (unused)
     * @param string $tag    tag name, matched case-insensitively
     * @param array  $attrs  attribute name => value pairs
     */
    public function openHandler ($parser, $tag, $attrs)
    {
        $tag = strtolower($tag);

        if ($tag === 'tr')
        {
            // New row: the column counter restarts and any cell left
            // unclosed in the previous row ends here.
            $this->currow++;
            $this->curcol = -1;
            $this->incell = false;
            return;
        }

        // Tags other than cells must not touch the grid. Running the span
        // logic for them would advance the column and reserve a phantom
        // cell whenever markup is nested inside a TD/TH.
        if ($tag !== 'td' && $tag !== 'th')
            return;

        // Advance to the next slot not already reserved by a rowspan or
        // colspan of an earlier cell. isset() avoids undefined-offset
        // notices on slots that were never touched.
        $this->curcol++;
        while (isset($this->shape[$this->currow][$this->curcol]))
            $this->curcol++;

        $rowspan = 1;
        $colspan = 1;
        if (is_array($attrs))
        {
            foreach ($attrs as $k=>$v)
            {
                $k = strtolower($k);
                if ($k === 'rowspan')
                    $rowspan = (int)$v;
                elseif ($k === 'colspan')
                    $colspan = (int)$v;
            }
        }

        // A cell always occupies at least its own slot; zero, negative or
        // non-numeric spans are treated as 1.
        $rowspan = max(1, $rowspan);
        $colspan = max(1, $colspan);

        for ($i = 0; $i < $rowspan; $i++)
        {
            for ($j = 0; $j < $colspan; $j++)
            {
                $x = $this->currow + $i;
                $y = $this->curcol + $j;
                if (isset($this->shape[$x][$y]))
                    error_log('Overlap!');
                $this->shape[$x][$y] = TRUE;
            }
        }

        // Create the cell up front so that empty cells appear in the result
        // and dataHandler() never appends to an undefined offset.
        $this->data[$this->currow][$this->curcol] = '';
        $this->incell = true;
    }

    /**
     * Handle a closing tag. Closing a cell or a row ends text collection, so
     * whitespace between cells is not appended to the preceding cell.
     *
     * @param mixed  $parser the parser invoking the callback (unused)
     * @param string $tag    tag name, matched case-insensitively
     */
    public function closeHandler ($parser, $tag)
    {
        $tag = strtolower($tag);
        if ($tag === 'td' || $tag === 'th' || $tag === 'tr')
            $this->incell = false;
    }

    /**
     * Handle character data: append it to the current cell, or drop it when
     * no cell is open.
     *
     * @param mixed  $parser the parser invoking the callback (unused)
     * @param string $data   the text chunk
     */
    public function dataHandler ($parser, $data)
    {
        if (!$this->incell)
            return;
        $this->data[$this->currow][$this->curcol] .= $data;
    }

    /**
     * Return the collected cell text.
     *
     * @return array rows of cells, indexed [row][column] with 0-based keys;
     *               slots covered only by another cell's span are absent
     */
    public function getData ()
    {
        // Cells seen before any TR are stored under row -1; they are left
        // out of the result. Work on a copy so the parser state is untouched.
        $result = $this->data;
        unset($result[-1]);
        return $result;
    }
}
// Pull in the PEAR XML_HTMLSax3 parser class used below.
include 'XML/HTMLSax3.php';

// Sample markup: four rows mixing plain cells with ROWSPAN and COLSPAN cells.
$markup = '
<table>
<tr>
<td rowspan="2">Test table lalala</td>
<td>123</td>
<td>456</td>
</tr>
<tr>
<td>789</td>
<td>ABC</td>
</tr>
<tr>
<td colspan="2" rowspan="2">123</td>
<td>456</td>
</tr>
<tr>
<td>789</td>
</tr>
</table>
';

// Register a TableParser instance as the receiver of the element and
// character-data callbacks, naming the methods to be called on it.
$handler = new TableParser;
$parser = new XML_HTMLSax3;
$parser->set_object($handler);
$parser->set_element_handler('openHandler', 'closeHandler');
$parser->set_data_handler('dataHandler');

// Parse the sample and dump the resulting array of rows of cells.
$parser->parse($markup);
print_r($handler->getData());
?>
--
Toby A Inkster BSc (Hons) ARCS
[Geek of HTML/SQL/Perl/PHP/Python/Apache/Linux]
[OS: Linux 2.6.12-12mdksmp, up 29 days, 10:43.]
PHP Domain Class
http://tobyinkster.co.uk/blog/2007/07/19/php-domain-class/
[Back to original message]
|