Common SQL statements for ht://Check
------------------------------------

Copyright (c) 1999-2004 Comune di Prato - Prato - Italy
Some Portions Copyright (c) 1995-2003 The ht://Dig Group <www.htdig.org>
Author: Gabriele Bartolini - Prato - Italy <angusgb@users.sourceforge.net>
$Id: SQL,v 1.11 2003/12/30 09:38:29 angusgb Exp $

ht://Check is distributed under the GNU General Public License (GPL).
See the COPYING file for license information.

ht://Check is a world-wide-web utility for an intranet or small internet.

-------------------------------------------------------------------
Retrieve all the anchors (A name="anchorname"):

 SELECT HtmlAttribute.IDUrl, HtmlAttribute.Content FROM HtmlAttribute,
  HtmlStatement WHERE  HtmlStatement.IDUrl=HtmlAttribute.IDUrl AND 
  HtmlAttribute.TagPosition=HtmlStatement.TagPosition AND
  HtmlAttribute.Attribute='name' AND HtmlStatement.Tag='a';

Retrieve all the anchor found and not after LinkResult has been set by
 the program with the Scheduler::SetLinkResults() method:
 SELECT DISTINCT Link.IDUrlSrc, Link.IDUrlDest, Link.TagPosition,
  Link.AttrPosition, Link.Anchor, TmpAnchors.IDUrl
  FROM Link LEFT JOIN TmpAnchors ON
  Link.IDUrlDest=TmpAnchors.IDUrl
  AND TmpAnchors.Anchor=Link.Anchor
  WHERE Link.Anchor != '' AND Link.LinkResult='OK'
  
Retrieve all the linked URL:

 SELECT Link.*, Url.StatusCode
  FROM Link LEFT JOIN Url on Link.IDUrlDest=Url.IDUrl
  WHERE Link.LinkResult='NotChecked' 
  
Retrieve all of the groups of LinkResult with the retrieval Status:
 SELECT Link.LinkResult, Schedule.Status, COUNT(*)
  FROM Link, Schedule
  WHERE Link.IDUrlDest=Schedule.IDUrl
  GROUP BY Link.LinkResult, Schedule.Status

Retrieve all of the groups of LinkResult:
 SELECT Link.LinkResult, COUNT(*)
  FROM Link GROUP BY Link.LinkResult

Retrieve all the broken links (1st version - Without redirected URLs, but
 with HtmlAttribute too):
 SELECT UrlSrc.IDUrl as IDUrlSrc, UrlDest.IDUrl as IDUrlDest,
  UrlSrc.Url as UrlSrc, UrlDest.Url as UrlDest,
  UrlDest.StatusCode, UrlDest.ReasonPhrase,
  UrlDest.ConnStatus, Link.LinkType,
  HtmlStatement.Statement, HtmlAttribute.Attribute, HtmlAttribute.Content
  FROM Url UrlDest, Url UrlSrc, Link, HtmlStatement, HtmlAttribute
  WHERE Link.LinkResult = 'Broken'
  AND HtmlStatement.IDUrl = Link.IDUrlSrc
  AND HtmlStatement.TagPosition = Link.TagPosition
  AND HtmlAttribute.IDUrl = Link.IDUrlSrc
  AND HtmlAttribute.TagPosition = Link.TagPosition
  AND HtmlAttribute.AttrPosition = Link.AttrPosition
  AND UrlSrc.IDUrl = Link.IDUrlSrc
  AND UrlDest.IDUrl = Link.IDUrlDest
  ORDER BY UrlSrc, UrlDest, Link.TagPosition, Link.AttrPosition

Retrieve all the broken links (2nd version with redirected Urls but withou
 HtmlAttribute table):
 SELECT UrlSrc.IDUrl as IDUrlSrc, UrlDest.IDUrl as IDUrlDest,
  UrlSrc.Url as UrlSrc, UrlDest.Url as UrlDest,
  UrlDest.StatusCode, UrlDest.ReasonPhrase,
  UrlDest.ConnStatus, Link.LinkType,
  HtmlStatement.Statement
  FROM Url UrlDest, Url UrlSrc, Link
  LEFT JOIN HtmlStatement
  ON HtmlStatement.IDUrl = Link.IDUrlSrc
  AND HtmlStatement.TagPosition = Link.TagPosition
  WHERE Link.LinkResult = 'Broken'
  AND UrlSrc.IDUrl = Link.IDUrlSrc
  AND UrlDest.IDUrl = Link.IDUrlDest
  ORDER BY UrlSrc, UrlDest, Link.TagPosition, Link.AttrPosition

Retrieve all the anchors not found:
 SELECT UrlSrc.IDUrl as IDUrlSrc, UrlDest.IDUrl as IDUrlDest,
  UrlSrc.Url as UrlSrc, UrlDest.Url as UrlDest,
  Link.LinkType, Link.Anchor, HtmlStatement.Statement
  FROM Url UrlDest, Url UrlSrc, Link
  LEFT JOIN HtmlStatement
  ON HtmlStatement.IDUrl = Link.IDUrlSrc
  AND HtmlStatement.TagPosition = Link.TagPosition
  WHERE Link.LinkResult = 'AnchorNotFound'
  AND UrlSrc.IDUrl = Link.IDUrlSrc
  AND UrlDest.IDUrl = Link.IDUrlDest
  ORDER BY UrlSrc, UrlDest, Link.TagPosition, Link.AttrPosition

Retrieve all the Urls linked to from inside a URL pattern outside
  SELECT Source.Url, Dest.Url, Statement
  FROM Url Source, Schedule Dest, Link
  LEFT JOIN HtmlStatement
  ON HtmlStatement.IDUrl = Link.IDUrlSrc
  AND HtmlStatement.TagPosition = Link.TagPosition
  WHERE Dest.IDUrl = Link.IDUrlDest
  AND Source.IDUrl = Link.IDUrlSrc
  AND Source.Url like 'pattern%'
  AND Dest.Url not like 'pattern%'
  
Retrieve filenames lengths
  SELECT Url, instr(reverse(Url),'/') -1 as FileLength,
  instr(reverse(Url),'.') -1 as ExtensionLength,
  instr(reverse(Url),'/') - instr(reverse(Url),'.') as NameLength
  FROM Url

Retrieve all the documents that don't respect a filename lenght
 of 8 chars and an extension bigger than 3 chars (not query string):
  SELECT Url
  FROM Url
  WHERE Url NOT REGEXP '^.*/(([A-Za-z0-9_-]{1,8}\.[A-Za-z]{1,3})?|.*\\?.+)$' 

Retrieve all the image reference tags which both found and not found
 image alternative files.
SELECT Url.Url, HtmlStatement.Statement, HtmlAttribute.Attribute
 FROM HtmlStatement, Url
 LEFT JOIN HtmlAttribute ON
 HtmlStatement.IDUrl = HtmlAttribute.IDUrl
  AND HtmlStatement.TagPosition = HtmlAttribute.TagPosition
  AND HtmlAttribute.Attribute = 'ALT'
 WHERE Tag='IMG'
  AND Url.IDUrl = HtmlStatement.IDUrl
 ORDER BY Attribute, Url, HtmlStatement.TagPosition


-------------------------------------------------------------------
