top button
Flag Notify
    Connect to us
      Site Registration

Site Registration

parsing html in Perl [CLOSED]

+1 vote
477 views

What would be the best module available for parsing html?
My intention is to parse html that contains a table of 5 columns and any number of rows, and have a hash ref like $html->{1}->{col1}=data11, $html->{1}->{col2}=data12 ...$html->{2}->{col1}=data21, $html->{2}->{col2}=data22...etc
Would there be an existing module that can do this without too much effort on my part?

closed with the note: none
posted Aug 8, 2013 by Meenal Mishra

Share this question
Facebook Share Button Twitter Share Button LinkedIn Share Button

3 Answers

+1 vote

For parsing HTML tables, you want HTML::TableExtract, IMO.
https://metacpan.org/module/HTML::TableExtract

It makes life easy.

answer Aug 8, 2013 by Abhay Kulkarni
0 votes

Have a look at HTML::Parser.

answer Aug 8, 2013 by Luv Kumar
0 votes

I would also say look at HTML::TreeBuilder

answer Aug 8, 2013 by Seema Siddique
Similar Questions
+1 vote

I tried to install the PDF::FromHTML module from CPAN, but I was not able to get it installed.
Please let me know of any other module that does the same thing. Please help.

+2 votes

This is the XML file that I need to parse and store in a database using Java; some nodes are missing during parsing.

<Products>
<Product>
<ProductURLs>
<ProductURL>http://www.partner.viator.com/en/13689/tours/Rome/Skip-the-Line-Vatican-Museums-Walking-Tour-including-Sistine-Chapel-Raphael-s-Rooms-and-St-Peter-s/d511-3731VATICAN</ProductURL>
</ProductURLs>
<ProductStarRating>
<AvgRating>4.5</AvgRating>
<AvgRatingStarURL>http://www.partner.viator.com/images/stars/red/17-4_5.gif</AvgRatingStarURL>
</ProductStarRating>
<IATACode>Rome</IATACode>
<BookingType>FreesaleOnRequest</BookingType>
<VoucherOption>VOUCHER_E</VoucherOption>
<ProductStarRating>
<AvgRating>4.5</AvgRating>
<AvgRatingStarURL>http://www.partner.viator.com/images/stars/red/17-4_5.gif</AvgRatingStarURL>
</ProductStarRating>
</Product>
<Product>
<ProductURLs>
<ProductURL>http://www.partner.viator.com/en/13689/tours/Rome/Skip-the-Line-Vatican-Museums-Walking-Tour-including-Sistine-Chapel-Raphael-s-Rooms-and-St-Peter-s/d511-3731VATICAN</ProductURL>
</ProductURLs>
<ProductStarRating>
<AvgRating>4.5</AvgRating>
<AvgRatingStarURL>http://www.partner.viator.com/images/stars/red/17-4_5.gif</AvgRatingStarURL>
</ProductStarRating>
<IATACode>Rome</IATACode>
<BookingType>FreesaleOnRequest</BookingType>
<VoucherOption>VOUCHER_E</VoucherOption>
</Product>
<Product>
<ProductURLs>
<ProductURL>http://www.partner.viator.com/en/13689/tours/Rome/Skip-the-Line-Vatican-Museums-Walking-Tour-including-Sistine-Chapel-Raphael-s-Rooms-and-St-Peter-s/d511-3731VATICAN</ProductURL>
</ProductURLs>
<ProductStarRating>
<AvgRating>4.5</AvgRating>
<AvgRatingStarURL>http://www.partner.viator.com/images/stars/red/17-4_5.gif</AvgRatingStarURL>
</ProductStarRating>
<BookingType>FreesaleOnRequest</BookingType>
<VoucherOption>VOUCHER_E</VoucherOption>
</Product>
</Products>

This is my Java code:

package test;

import javax.print.attribute.standard.Destination;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.w3c.dom.Node;
import org.w3c.dom.Element;
import org.w3c.dom.Text;

import com.mysql.jdbc.PreparedStatement;
import java.io.File;
import java.io.StringWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;


/**
 * Loads products from an XML file and stores one row per {@code <Product>}
 * element in the {@code VIATOR_PRODUCTS} table.
 *
 * <p>For every product the text of the first {@code IATACode} descendant and the
 * first {@code AvgRating} descendant is written to the {@code IATA_CODE} and
 * {@code AVG_RATING} columns. A value that is absent from the XML is stored as an
 * empty string, so a product that lacks one of the two elements no longer loses
 * the other one.
 */
public class Test3 {

    /** Single insert used for every product; both columns are always supplied. */
    private static final String INSERT_SQL =
            "INSERT INTO VIATOR_PRODUCTS(IATA_CODE,AVG_RATING) VALUES (?,?)";

    /**
     * Parses {@code E:/xml/xml/test.xml} and inserts each product into MySQL.
     *
     * <p>Any failure (driver not found, unreadable or malformed XML, SQL error) is
     * reported by printing the stack trace; nothing is rethrown.
     *
     * @param args ignored
     */
    public static void main(String[] args) {

        try {
            Class.forName("com.mysql.jdbc.Driver");

            // Parse before connecting so a broken XML file never opens a connection.
            File fXmlFile = new File("E:/xml/xml/test.xml");
            DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
            Document doc = dBuilder.parse(fXmlFile);

            //optional, but recommended
            //read this - http://stackoverflow.com/questions/13786607/normalization-in-dom-parsing-with-java-how-does-it-work
            doc.getDocumentElement().normalize();

            System.out.println("Root element :" + doc.getDocumentElement().getNodeName());

            NodeList products = doc.getElementsByTagName("Product");

            System.out.println("----------------------------");

            // try-with-resources closes the statement and the connection even when an
            // insert fails. java.sql.PreparedStatement is spelled out in full because
            // the file imports com.mysql.jdbc.PreparedStatement under the same simple
            // name; the standard JDBC interface is all that is needed here.
            try (Connection con = DriverManager.getConnection("jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8", "root", "passwurd");
                 java.sql.PreparedStatement insert = con.prepareStatement(INSERT_SQL)) {

                for (int i = 0; i < products.getLength(); i++) {
                    // getElementsByTagName only yields element nodes, so the cast is safe.
                    Element product = (Element) products.item(i);

                    insert.setString(1, firstTagText(product, "IATACode"));
                    insert.setString(2, firstTagText(product, "AvgRating"));
                    insert.executeUpdate();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the text content of the first descendant of {@code parent} named
     * {@code tagName}, or an empty string when there is no such descendant.
     */
    private static String firstTagText(Element parent, String tagName) {
        // getElementsByTagName never returns null; an absent tag gives an empty list.
        NodeList matches = parent.getElementsByTagName(tagName);
        return matches.getLength() > 0 ? matches.item(0).getTextContent() : "";
    }
}
0 votes

As I understand it, Perl has excellent report-generation capabilities. By using formats, we can actually visualize how our output will look, because the definition of a format in Perl is very similar to what you see in the output. Is there any way to convert these formats into HTML reports? My goal is to create good-looking HTML reports. Please suggest/advise.

...