|
Posted by Cleverbum on 10/27/06 09:40
I was hoping to parse a webpage, extract all link and image URLs from
it, and enter the new ones into MySQL tables. Below is my code to do
it. I've tried to optimise it as much as I can, but it still takes too
long to execute (it hits timeouts on a server which I cannot control).
I was wondering if there is some way to compile the code, or if anyone
can spot something which could be better written.
Thanks in advance,
Martin
<?php
/*
 * Parses the stored page selected by ?uid=..., extracts image and link URLs
 * from it and records the ones not yet known:
 *   - <img src="..."> URLs and links ending in a JPEG-like extension go to
 *     mark_images,
 *   - every other accepted link goes to mark_linktodl.
 * Finally the page is logged in mark_parsed together with an md5 of its
 * contents. With ?debug=t the elapsed time is rendered as an image,
 * otherwise tick() is echoed.
 */
$start = microtime_float();
$uid = isset($_GET['uid']) ? $_GET['uid'] : '';
// Absolute http: links are only followed when they contain this text
// (plain, case-insensitive substring match).
$restriction = "dontstayin";
include("loadstuff.php"); // just contains a function which echos a tick or cross
include("userpass.php"); // contains database access details.
mysql_connect($host, $user, $password);
@mysql_select_db($database) or die(cross());

// $uid comes straight from the query string, so it must be escaped.
$result = mysql_query(
    "SELECT * FROM mark_toparse WHERE uid='" . mysql_real_escape_string($uid) . "'"
);
if (!$result || mysql_num_rows($result) == 0) {
    // Nothing queued under this uid: report failure instead of parsing nothing.
    die(cross());
}
$contents = mysql_result($result, 0, "fcontents");
$fullorig = mysql_result($result, 0, "originalpath");
// Directory part of the original URL, used to resolve relative URLs.
// (The original code took substr() of the still-undefined $origpath here,
// which always produced an empty base path.)
$origpath = substr($fullorig, 0, strrpos($fullorig, "/") + 1);

// Splitting on ">" yields one fragment per tag (without its closing ">").
foreach (explode(">", $contents) as $fragment) {
    $imgsrc = stristr($fragment, "<img");
    if ($imgsrc !== false) {
        // Apostrophes are stripped so stored URLs stay comparable with rows
        // written by earlier runs, which stripped them as well.
        $url = mark_quoted_value(str_replace("'", "", $imgsrc), "src");
        if ($url === null) {
            continue; // no src="..." in this tag
        }
        if (strncasecmp($url, "http:", 5) != 0) {
            $url = mark_absolute_url($origpath, $url);
        }
        mark_store_url("mark_images", $url);
        continue;
    }

    $link = stristr($fragment, "href=");
    if ($link === false) {
        continue;
    }
    $url = mark_quoted_value(str_replace("'", "", $link), "href=");
    if ($url === null) {
        continue; // href without a double-quoted value
    }
    if (strncasecmp($url, "http:", 5) != 0) {
        if (strncasecmp($url, "mailto:", 7) == 0
            || strncasecmp($url, "ftp:", 4) == 0
            || strncasecmp($url, "msnim:", 6) == 0) {
            continue; // ignore ftp, mailto and msn links
        }
        $url = mark_absolute_url($origpath, $url);
    } elseif (stristr($url, $restriction) === false) {
        continue; // absolute link outside the restricted site
    }
    mark_store_url(mark_table_for($url), $url);
}

//mysql_query("DELETE FROM mark_toparse WHERE originalpath='$fullorig'");
mysql_query(
    "INSERT INTO mark_parsed VALUES('','"
    . mysql_real_escape_string($fullorig) . "','" . md5($contents) . "')"
);
mysql_close();
$finish = microtime_float();
if (isset($_GET['debug']) && strcmp($_GET['debug'], "t") == 0) {
    $tim = $finish - $start;
    include("error_image.php");
    // imagepng() writes the image itself and returns a bool; echoing that
    // return value would append a stray "1" to the PNG data.
    imagepng(errorimage("Analysis: $tim s"));
} else {
    echo tick();
}

/**
 * Returns the first double-quoted value found at or after $marker.
 *
 * @param string $fragment Tag text to search.
 * @param string $marker   Text to locate first (matched case-insensitively),
 *                         e.g. "src" or "href=".
 * @return string|null The text between the quotes, or null when the marker
 *                     or a complete pair of double quotes is missing.
 */
function mark_quoted_value($fragment, $marker)
{
    // strtolower() is byte-wise, so offsets in the lowered copy are valid
    // offsets in the original fragment.
    $from = strpos(strtolower($fragment), strtolower($marker));
    if ($from === false) {
        return null;
    }
    $open = strpos($fragment, "\"", $from);
    if ($open === false) {
        return null;
    }
    $close = strpos($fragment, "\"", $open + 1);
    if ($close === false) {
        return null;
    }
    return (string) substr($fragment, $open + 1, $close - $open - 1);
}

/**
 * Prefixes a relative URL with the page's base path and collapses doubled
 * slashes, restoring the "://" after the scheme afterwards.
 *
 * @param string $origpath Base path of the parsed page (ends in "/").
 * @param string $url      Relative URL taken from the page.
 * @return string
 */
function mark_absolute_url($origpath, $url)
{
    return str_replace(":/", "://", str_replace("//", "/", $origpath . $url));
}

/**
 * Chooses the destination table for a link URL: URLs ending in
 * .jpg/.jpeg/.jp2-style extensions are treated as images.
 *
 * @param string $url
 * @return string Table name.
 */
function mark_table_for($url)
{
    // D modifier: "$" matches only at the very end, like the eregi() it replaces.
    return preg_match('/\.jp[eg2]{1,2}$/iD', $url) ? "mark_images" : "mark_linktodl";
}

/**
 * Inserts $url into $table unless a row with that url already exists.
 *
 * URLs already handled during this request are remembered in memory, so a
 * URL repeated many times on one page costs a single SELECT instead of one
 * per occurrence.
 *
 * @param string $table Table name; must be a trusted literal, it is not escaped.
 * @param string $url   URL to record (escaped here before use in SQL).
 * @return void
 */
function mark_store_url($table, $url)
{
    static $seen = array();
    $key = $table . "\n" . $url;
    if (isset($seen[$key])) {
        return;
    }
    $seen[$key] = true;

    $safe = mysql_real_escape_string($url);
    $res = mysql_query("SELECT count(*) FROM $table WHERE url='$safe'");
    $row = $res ? mysql_fetch_array($res, MYSQL_NUM) : false;
    if ($row && $row[0] == 0) {
        mysql_query("INSERT INTO $table VALUES('','$safe')");
    }
}
/**
 * Current Unix timestamp as a float with sub-second precision.
 *
 * @return float Seconds since the epoch, including the fractional part.
 */
function microtime_float()
{
    // microtime() returns "fraction seconds": the sub-second part first,
    // then the whole seconds, separated by a single space.
    $parts = explode(" ", microtime());
    $fraction = (float)$parts[0];
    $seconds = (float)$parts[1];
    return ($fraction + $seconds);
}
?>
Navigation:
[Reply to this message]
|