SOL9 2.0 Class: HTMLParser

 SOL9 C++ Class Library  SOL9 Samples  SOL9 Tutorial  SOL9 FAQ  SOL9 ClassTree  SOL9 ClassList 

Source code

/******************************************************************************
 *
 * Copyright (c) 2008 Antillia.com TOSHIYUKI ARAI. ALL RIGHTS RESERVED. 
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions, and the following disclaimer.
 *  
 * 2. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *
 *  HTMLParser.h
 *
 *****************************************************************************/

//2008/09/30 Simple HTMLParser based on IHTMLDocument2 interface.
//2012/02/16 Updated loadString method.
//2012/02/17 Modified to include <sol/Object.h> and not to import <mshtml.tlb>

// See the following code.
// http://209.85.175.104/search?
// q=cache:zDWb5JtAMfIJ:www.ddj.com/architect/184411019+HTMLParser+IOleClientSite&hl=ja&ct=clnk&cd=3&gl=jp

//See also the following url.
//1: http://www.codeproject.com/KB/IP/parse_html.aspx?
//fid=3219&df=90&mpp=25&noise=3&sort=Position&view=Quick&fr=76&select=249106


#pragma once
#include <sol/Object.h>
#include <sol/Exception.h> //2015/07/15

#include <mshtmdid.h>
#include <docobj.h> 

#include <sol/Wchardef.h>

//namespace = MSHTML
//2012/02/17 #import "mshtml.tlb"  named_guids

// Thread message posted by HTMLParser::OnChanged when the document reaches
// READYSTATE_COMPLETE. Parenthesized so the macro expands safely inside
// larger expressions.
#define WM_HTML_COMPLETE   (WM_USER+2009)

//
namespace SOL
{

/**
 * class HTMLParser without GUI of IE.
 */
class HTMLParser: public IPropertyNotifySink, 
        IOleClientSite, IDispatch {

private:
  // Member variables
  DWORD   dwRef;
  DWORD    dwCookie;
  IHTMLDocument2* pHTMLDocument;
  IConnectionPoint* pCP;

public:
  /**
   * Construtor
   */
  HTMLParser()
    :dwRef(1), 
      dwCookie(0),
      pHTMLDocument(NULL),
      pCP(NULL)
  {
    HRESULT hr = S_OK;
   
    try {
      //Create an instance of IHTMLDocument2, because we don't need IE UI.
      if (FAILED(hr = CoCreateInstance( CLSID_HTMLDocument, NULL, 
          CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, (LPVOID*)&pHTMLDocument ))) {

        throw Exception(E_FAIL, "%s: %s", "E_FAIL",__FUNCTION__);//Failed to create MSHTMLDocument;    
      }

      LPOLEOBJECT pOleObject = NULL;

      if (FAILED(hr = pHTMLDocument->QueryInterface(IID_IOleObject, 
                                                   (LPVOID*)&pOleObject))) {
        throw Exception(E_FAIL, "%s: %s", "E_FAIL",__FUNCTION__);//Failed to query OleObject;    
      }
  
      hr = pOleObject->SetClientSite((IOleClientSite*)this);
      pOleObject->Release();

      //OleControl
      LPOLECONTROL pOleControl = NULL;

      if (FAILED(hr = pHTMLDocument->QueryInterface(IID_IOleControl, 
                                                   (LPVOID*)&pOleControl))) {

        throw Exception(E_FAIL, "%s: %s", "E_FAIL",__FUNCTION__);//Failed to query ConnectionPoint;    
      }
  
      hr = pOleControl->OnAmbientPropertyChange(DISPID_AMBIENT_DLCONTROL);
      pOleControl->Release();
    
      IConnectionPointContainer* tmppCPC = NULL;

      if (FAILED(hr = pHTMLDocument->QueryInterface(IID_IConnectionPointContainer,
        (LPVOID*)&tmppCPC))) {
        throw Exception(E_FAIL, "%s: %s", "E_FAIL",__FUNCTION__);//Failed to query ConnectionPoint;    
      }

      if (FAILED(hr = tmppCPC->FindConnectionPoint(IID_IPropertyNotifySink, 
                        &pCP))){
        tmppCPC->Release();
        throw hr  ;//Failed to find ConnectionPoint;    
      }
      tmppCPC->Release();

      if (FAILED(hr = pCP->Advise((LPUNKNOWN)(IPropertyNotifySink*)this, 
          &dwCookie))) {
        throw Exception(E_FAIL, "%s: %s", "E_FAIL",__FUNCTION__);  //Failed to Advice;    
      }
    } catch (HRESULT hr) {
      clear();
      throw Exception(E_FAIL, "%s: %s HRESULT(%ld)", "E_FAIL",__FUNCTION__, hr);
    }
  }

public:
  /**
   * Destructor
   */
  ~HTMLParser() 
  {
    clear();
  }

private:
  void clear() 
  {
    if (pCP) {
      pCP->Unadvise(dwCookie);
      pCP->Release();
    }
    if (pHTMLDocument) {
      pHTMLDocument->Release();
    }

    pCP = NULL;
    pHTMLDocument = NULL;
  }

public:

  STDMETHODIMP QueryInterface(REFIID riid, LPVOID* ppv) 
  {
    *ppv = NULL;

    if (IID_IUnknown == riid || IID_IPropertyNotifySink == riid) {
      *ppv = (LPUNKNOWN)(IPropertyNotifySink*)this;
      AddRef();
      return NOERROR;
    } else if (IID_IOleClientSite == riid) {
      *ppv = (IOleClientSite*)this;
      AddRef();
      return NOERROR;
    } else if (IID_IDispatch == riid) {
      *ppv = (IDispatch*)this;
      AddRef();
      return NOERROR;
    } else {
      return E_NOTIMPL;
    }
  }


public:
  STDMETHOD_(ULONG, AddRef)() 
  {
    return ++dwRef;
  }

public:
  STDMETHOD_(ULONG, Release)() 
  {
    if (--dwRef == 0) { 
      delete this; 
      return 0; 
    }
    return dwRef;
  }

public:
  STDMETHODIMP OnChanged(DISPID dispID) 
  {
    HRESULT hr;
  
    if (DISPID_READYSTATE == dispID){
      VARIANT vResult = {0};
      EXCEPINFO excepInfo;
      UINT uArgErr;
      long lReadyState;
      DISPPARAMS dp = {NULL, NULL, 0, 0};
      
      if (SUCCEEDED(hr = pHTMLDocument->Invoke(DISPID_READYSTATE, IID_NULL, 
                          LOCALE_SYSTEM_DEFAULT, DISPATCH_PROPERTYGET, 
                          &dp, &vResult, &excepInfo, &uArgErr))) {
        lReadyState = (READYSTATE)V_I4(&vResult);

        switch (lReadyState) {   
        case READYSTATE_UNINITIALIZED:
        case READYSTATE_LOADING: 
        case READYSTATE_LOADED: 
        case READYSTATE_INTERACTIVE:
          break;
        case READYSTATE_COMPLETE: 
          printf("ReadState_Complete\n");
          // IE has just finished parsing the HTML file(READYSTATE_COMPLETE).
          // Post a WM_HTML_COMPLETE message to this current thread.
          
          PostThreadMessage(GetCurrentThreadId(),
                                WM_HTML_COMPLETE, (WPARAM)0, (LPARAM)0);
          break;
        }
        VariantClear(&vResult);
      }
    }
    return NOERROR;
  }


public:
  /** 
   * If DISPID== DISPID_AMBIENT_DLCONTROL, set the following flags to return value, 
   * because this is a UI-less HTML Parser.
   *
   *  DLCTL_DOWNLOADONLY | 
   *  DLCTL_NO_SCRIPTS |    
   *  DLCTL_NO_JAVA |      
   *  DLCTL_NO_DLACTIVEXCTLS |  
   *  DLCTL_NO_RUNACTIVEXCTLS;
   */
  STDMETHODIMP Invoke(DISPID dispIdMember, REFIID riid, LCID lcid,
            WORD wFlags, DISPPARAMS __RPC_FAR *pDispParams,
            VARIANT __RPC_FAR *pVarResult, EXCEPINFO __RPC_FAR *pExcepInfo,
            UINT __RPC_FAR *puArgErr) 
  {
    if (!pVarResult){
      return E_POINTER;
    }
    
    switch(dispIdMember) {

    case DISPID_AMBIENT_DLCONTROL:
      printf("HTMLParser::Invoke,DISPID_AMBIENT_DLCONTROL\n");
      V_VT(pVarResult) = VT_I4;
      
      V_I4(pVarResult) =  DLCTL_DOWNLOADONLY | 
                            DLCTL_NO_SCRIPTS | 
                            DLCTL_NO_JAVA |
                            DLCTL_NO_DLACTIVEXCTLS |
                            DLCTL_NO_RUNACTIVEXCTLS;            
      break;

    default:
      return DISP_E_MEMBERNOTFOUND;
    }
    return NOERROR;
  }


protected:
  STDMETHODIMP OnRequestEdit(DISPID dispID) { 
    return NOERROR; 
  }
        
  // IOleClientSite methods
  STDMETHODIMP SaveObject(void) { 
    return E_NOTIMPL; 
  }

  STDMETHODIMP GetMoniker(DWORD dwAssign,
    DWORD dwWhichMoniker, IMoniker** ppmk){
    return E_NOTIMPL; 
  }

  STDMETHODIMP GetContainer(IOleContainer** ppContainer){
    return E_NOTIMPL; 
  }
        
  STDMETHODIMP ShowObject(void){ 
    return E_NOTIMPL; 
  }
     
  STDMETHODIMP OnShowWindow(BOOL fShow){ 
    return E_NOTIMPL; 
  }
    
    STDMETHODIMP RequestNewObjectLayout(void){ 
    return E_NOTIMPL; 
  }
  
  
  // IDispatch method
  STDMETHODIMP GetTypeInfoCount(UINT* pctinfo){ 
    return E_NOTIMPL; 
  }
     
  STDMETHODIMP GetTypeInfo(UINT iTInfo, LCID lcid, ITypeInfo** ppTInfo){ 
    return E_NOTIMPL; 
  }
    
  STDMETHODIMP GetIDsOfNames(REFIID riid, LPOLESTR* rgszNames,
                                UINT cNames, LCID lcid, DISPID* rgDispId){ 
    return E_NOTIMPL; 
  }

public:
  /**
   * Load HTML from an interface of IStream.
   */
  HRESULT loadFromStream(IStream* pStream)
  {
    HRESULT hr = E_FAIL;
    IPersistStreamInit* pPersistStreamInit = NULL;

    if (pHTMLDocument != NULL) {

      if (SUCCEEDED(pHTMLDocument->QueryInterface(IID_IPersistStreamInit, 
        (void**)&pPersistStreamInit))) {
      
        if (SUCCEEDED(pPersistStreamInit->InitNew())) {
      
          if (SUCCEEDED(hr = pPersistStreamInit->Load(pStream))) {
            wait();
          }
        }
        pPersistStreamInit->Release();
      }
    }
    return hr;
  }

public:
  /**
   * LoadFrom a string.
   */
  HRESULT loadFromString(const TCHAR* html)
  {  
    HRESULT hr = E_FAIL;
    if (html == NULL) {
      return hr;
    }
    //<Modified at="2012/02/16">
    int cchLength = strlen(html)+1;
    int nBytes = cchLength* sizeof(TCHAR);

    HGLOBAL hHTML = GlobalAlloc(GPTR, nBytes);

    if (hHTML){
      void* buff = GlobalLock(hHTML);
      memcpy(buff, html, nBytes);

      GlobalUnlock(hHTML);

      IStream* pStream = NULL;
      if (SUCCEEDED(hr = CreateStreamOnHGlobal(hHTML, FALSE, &pStream))) {
        //</Modified>
        ULARGE_INTEGER size;
        size.QuadPart = nBytes;
        pStream->SetSize(size);
        hr = loadFromStream(pStream);
        pStream->Release();
      }
      GlobalFree(hHTML);
    }
    return hr;
  }

public:
  /**
   * Load an HTML file something like iC:\root\htmo\sample.htmlj
   * 
   */
  HRESULT loadFromFile(const TCHAR* filePath) 
  { 
    HRESULT  hr =E_FAIL;
    if (filePath == NULL) {
      return hr;
    }

    if (GetFileAttributes(filePath) == 0xffffffff ){
      printf("File is access error\n");
      return hr;
    }
    
    _bstr_t bstrFile(filePath);
    
    IPersistFile* pPF = NULL;
    //Get an interface for IID_IPersistFile.
    if (SUCCEEDED(hr = pHTMLDocument->QueryInterface(IID_IPersistFile, 
          (LPVOID*) &pPF))) {

      //Load the content by using a IPersitentFile interface.
      if (SUCCEEDED(hr = pPF->Load((LPCWSTR)bstrFile, STGM_READ | STGM_SHARE_DENY_NONE))) {
        wait();
      } 
      pPF->Release();
    }
    return hr;
  }

public:
  /**
   */
  void wait() {
    MSG msg;
    while (GetMessage(&msg, NULL, 0, 0)) {
      // Notification from OnChanged
      if (WM_HTML_COMPLETE == msg.message && NULL == msg.hwnd) {
        printf("HTMLParser#wait,1,OK,loading completed\n");
        break;
      } else {
        DispatchMessage(&msg);
      }
    }
  }



public:
  IHTMLDocument2* getHTMLDocument() 
  {
    if (pHTMLDocument) {
      pHTMLDocument->AddRef();
    }
    return this->pHTMLDocument;
  }

}; 

}


Last modified: 5 May 2019

Copyright (c) 2009-2019 Antillia.com ALL RIGHTS RESERVED.