| |
 |
|
| .NET DotNet Forum Index » Visual C++ Forum » Extracting text from HTML pages... |
|
Page 1 of 1 |
|
| Author |
Message |
| Naveen HS... |
Posted: Tue Oct 20, 2009 5:54 am |
|
|
|
Guest
|
Hello Everyone,
I am trying to extract the data inside the TABLE tags of HTML pages.
I am able to traverse the TABLE tags, but I am not able to extract the data
from the tables. Can anyone please help with this?
void CTestDlg::OnBgo()
{
UpdateData();
CWaitCursor wait;
if(m_csFilename.IsEmpty()){
AfxMessageBox(_T("Please specify the file to parse"));
return;
}
CFile f;
//let's open file and read it into CString (u can use any buffer to read
though
if (f.Open(m_csFilename, CFile::modeRead|CFile::shareDenyNone)) {
m_wndLinksList.ResetContent();
CString csWholeFile;
f.Read(csWholeFile.GetBuffer(f.GetLength()), f.GetLength());
csWholeFile.ReleaseBuffer(f.GetLength());
f.Close();
//declare our MSHTML variables and create a document
MSHTML::IHTMLDocument2Ptr pDoc;
MSHTML::IHTMLDocument3Ptr pDoc3;
MSHTML::IHTMLElementCollectionPtr pCollection;
MSHTML::IHTMLElementPtr pElement;
HRESULT hr = CoCreateInstance(CLSID_HTMLDocument, NULL,
CLSCTX_INPROC_SERVER,
IID_IHTMLDocument2, (void**)&pDoc);
//put the code into SAFEARRAY and write it into document
SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT, 0, 1);
VARIANT *param;
bstr_t bsData = (LPCTSTR)csWholeFile;
hr = SafeArrayAccessData(psa, (LPVOID*)¶m);
param->vt = VT_BSTR;
param->bstrVal = (BSTR)bsData;
hr = pDoc->write(psa);
hr = pDoc->close();
SafeArrayDestroy(psa);
pDoc3 = pDoc;
pDoc->get_all(&pCollection);
pCollection = pDoc3->getElementsByTagName("table");
for(long i=0; i<pCollection->length; i++){
pElement = pCollection->item(i, (long)0);
if(pElement != NULL){
//m_wndLinksList.AddString("Hello");
m_wndLinksList.AddString((LPCTSTR)bstr_t(pElement->getAttribute("table"),
10));
}
}
}
}
url:http://www.ureader.com/gp/1452-1.aspx |
|
|
| Back to top |
|
|
|
|
|
All times are GMT - 5 Hours
The time now is Sat Nov 21, 2009 9:40 pm
|
|