Main Page | Report this Page
 
.NET DotNet Forum Index  »  Visual C++ Forum  »  Extracting text from HTML pages...
Page 1 of 1    

Extracting text from HTML pages...

Author Message
Naveen HS...
Posted: Tue Oct 20, 2009 5:54 am
Guest
Hello Everyone,

I am trying to extract the data contained in the TABLE tags of HTML pages.

I am able to traverse the table tags, but I am not able to extract the data
from the tables. Can anyone please help with this?



void CTestDlg::OnBgo()
{

UpdateData();
CWaitCursor wait;
if(m_csFilename.IsEmpty()){
AfxMessageBox(_T("Please specify the file to parse"));
return;
}
CFile f;

//let's open file and read it into CString (u can use any buffer to read
though
if (f.Open(m_csFilename, CFile::modeRead|CFile::shareDenyNone)) {
m_wndLinksList.ResetContent();
CString csWholeFile;
f.Read(csWholeFile.GetBuffer(f.GetLength()), f.GetLength());
csWholeFile.ReleaseBuffer(f.GetLength());
f.Close();

//declare our MSHTML variables and create a document
MSHTML::IHTMLDocument2Ptr pDoc;
MSHTML::IHTMLDocument3Ptr pDoc3;
MSHTML::IHTMLElementCollectionPtr pCollection;
MSHTML::IHTMLElementPtr pElement;

HRESULT hr = CoCreateInstance(CLSID_HTMLDocument, NULL,
CLSCTX_INPROC_SERVER,
IID_IHTMLDocument2, (void**)&pDoc);

//put the code into SAFEARRAY and write it into document
SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT, 0, 1);
VARIANT *param;
bstr_t bsData = (LPCTSTR)csWholeFile;
hr = SafeArrayAccessData(psa, (LPVOID*)&param);
param->vt = VT_BSTR;
param->bstrVal = (BSTR)bsData;

hr = pDoc->write(psa);
hr = pDoc->close();

SafeArrayDestroy(psa);
pDoc3 = pDoc;

pDoc->get_all(&pCollection);

pCollection = pDoc3->getElementsByTagName("table");


for(long i=0; i<pCollection->length; i++){
pElement = pCollection->item(i, (long)0);
if(pElement != NULL){

//m_wndLinksList.AddString("Hello");
m_wndLinksList.AddString((LPCTSTR)bstr_t(pElement->getAttribute("table"),
10));
}
}

}
}

url:http://www.ureader.com/gp/1452-1.aspx
 
 
Page 1 of 1    
All times are GMT - 5 Hours
The time now is Sat Nov 21, 2009 9:40 pm