如何在网页中访问嵌入的PDF文件

本文关键字:PDF 文件 访问 网页 | 更新日期: 2023-09-27 18:36:37

当我下载一个嵌入了 PDF 文件的 URL 时,得到的只是整个页面的 HTML,而不是 PDF 文件本身。我尝试过 HttpWebRequest、WebClient、HtmlAgilityPack、MemoryStream 等,但不确定哪种方式可行。下面是我目前的代码。任何帮助都不胜感激。

// Downloads whatever the URL returns and saves the raw response bytes to disk.
// The bytes are written unchanged, so the saved file is only a real PDF if the
// server actually answers this GET request with PDF content.
string url = "http://emaps.emapsplus.com/rdl/MadisonCoAl/MadisonCoAl.aspx?showimg=yes&pid=1701013003029000";
byte[] result;
WebRequest wr = WebRequest.Create(url);
using (WebResponse response = wr.GetResponse())
using (Stream responseStream = response.GetResponseStream())
using (MemoryStream memoryStream = new MemoryStream())
{
    // Stream.CopyTo reads until the end of the response, replacing the manual
    // read/write loop over a fixed-size buffer.
    responseStream.CopyTo(memoryStream);
    result = memoryStream.ToArray();
}

// Verbatim string: the backslash is a literal path separator, not an escape.
File.WriteAllBytes(@"C:\testpdf.pdf", result);

如何在网页中访问嵌入的PDF文件

这个问题确实很棘手,因为实际下载到的并不是 pdf 文件。如果该 URL 指向的是一个普通的 pdf 文件,您的代码是可以正常工作的。但这是一个运行了一些 javascript 的网页,该脚本会回发(postback)到页面自身来生成 pdf。

我有针对您当前问题的答案,但是如果您需要对许多文件执行此操作,那么可能还有很长的路要走。为了使它正常工作,我通过 Fiddler 运行该页面,获取它回发给自身的 POST 字符串,然后用 C# 模拟这一过程,并将结果保存为 pdf。

这种方法很好用,但问题是:如果省去通过 Fiddler 获取 POST 字符串这一手动步骤,您基本上就必须自己实现一个 Web 浏览器,去理解并执行所有 javascript 代码,才能弄清楚该字符串是如何生成的。

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Threading.Tasks;
    using System.Windows.Forms;
namespace WindowsFormsApplication1
{
    /// <summary>
    /// Form that, when loaded, fetches the report page, extracts the ASP.NET
    /// <c>__VIEWSTATE</c> and <c>__EVENTVALIDATION</c> hidden-field values from its HTML,
    /// posts them back to the same URL, and saves the response body to disk.
    /// </summary>
    public partial class Form1 : Form
    {
        private const string ReportUrl = "http://emaps.emapsplus.com/rdl/MadisonCoAl/MadisonCoAl.aspx?showimg=yes&pid=1701013003029000";

        // Verbatim string: the backslash is a literal path separator.
        private const string OutputPath = @"C:\testpdf.pdf";

        private const int TimeoutSeconds = 30;

        // If the values are not being found it could be because the markup is being output
        // with single quotes; change this constant from "\"" to "'" in that case.
        private const string QuoteString = "\"";

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Runs the whole download when the form loads and reports the outcome in a message box.
        /// Network and file exceptions are not caught here and propagate to the caller.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            string sSource = DownloadPageSource(ReportUrl);
            if (string.IsNullOrEmpty(sSource))
            {
                MessageBox.Show("Cannot find source code for original url.");
                return;
            }

            // Without both of these values the postback cannot be built.
            string sViewState = ExtractHiddenFieldValue(sSource, "__VIEWSTATE");
            string sEventValidation = ExtractHiddenFieldValue(sSource, "__EVENTVALIDATION");
            if (sViewState == null || sEventValidation == null)
            {
                MessageBox.Show("Cannot find the __VIEWSTATE and/or __EVENTVALIDATION variables.");
                return;
            }

            string sPostData = "__EVENTTARGET=btnPageLoad&__EVENTARGUMENT=&__VIEWSTATE=" + System.Web.HttpUtility.UrlEncode(sViewState) + "&__EVENTVALIDATION=" + System.Web.HttpUtility.UrlEncode(sEventValidation) + "&hdnLoaded=false&ReportViewer1%24ctl03%24ctl00=&ReportViewer1%24ctl03%24ctl01=&ReportViewer1%24ctl11=&ReportViewer1%24ctl12=standards&ReportViewer1%24AsyncWait%24HiddenCancelField=False&ReportViewer1%24ToggleParam%24store=&ReportViewer1%24ToggleParam%24collapse=false&ReportViewer1%24ctl09%24ClientClickedId=&ReportViewer1%24ctl08%24store=&ReportViewer1%24ctl08%24collapse=false&ReportViewer1%24ctl10%24VisibilityState%24ctl00=Error&ReportViewer1%24ctl10%24ScrollPosition=&ReportViewer1%24ctl10%24ReportControl%24ctl02=&ReportViewer1%24ctl10%24ReportControl%24ctl03=&ReportViewer1%24ctl10%24ReportControl%24ctl04=100";
            byte[] oPostDataBuffer = Encoding.ASCII.GetBytes(sPostData);

            // Direct cast instead of "as": a non-HTTP request type fails here with a clear
            // InvalidCastException rather than a later NullReferenceException.
            HttpWebRequest oRequest = (HttpWebRequest)WebRequest.Create(new Uri(ReportUrl));
            oRequest.UserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0";
            oRequest.Timeout = TimeoutSeconds * 1000; // Timeout is expressed in milliseconds.
            oRequest.Method = "POST";
            oRequest.ContentType = "application/x-www-form-urlencoded";
            oRequest.ContentLength = oPostDataBuffer.Length;
            using (Stream oRequestStream = oRequest.GetRequestStream())
            {
                oRequestStream.Write(oPostDataBuffer, 0, oPostDataBuffer.Length);
            }

            byte[] oFile;
            using (HttpWebResponse oResponse = (HttpWebResponse)oRequest.GetResponse())
            {
                if (oResponse.StatusCode != HttpStatusCode.OK)
                {
                    // Report the status and stop; nothing was saved, so the success
                    // message below must not be shown.
                    MessageBox.Show(oResponse.StatusCode.ToString());
                    return;
                }

                using (Stream oResponseStream = oResponse.GetResponseStream())
                {
                    oFile = ReadAllBytes(oResponseStream);
                }
            }

            // The response is fully buffered before the file is created, so a failed
            // download does not leave a truncated file behind.
            File.WriteAllBytes(OutputPath, oFile);
            MessageBox.Show("PDF downloaded to " + OutputPath);
        }

        /// <summary>Issues a GET request for <paramref name="sUrl"/> and returns the body decoded as UTF-8.</summary>
        private static string DownloadPageSource(string sUrl)
        {
            WebRequest oRequest = WebRequest.Create(sUrl);
            using (WebResponse oResponse = oRequest.GetResponse())
            using (Stream oResponseStream = oResponse.GetResponseStream())
            {
                return Encoding.UTF8.GetString(ReadAllBytes(oResponseStream));
            }
        }

        /// <summary>Reads <paramref name="oStream"/> to its end and returns the bytes read.</summary>
        private static byte[] ReadAllBytes(Stream oStream)
        {
            using (MemoryStream oMemoryStream = new MemoryStream())
            {
                oStream.CopyTo(oMemoryStream);
                return oMemoryStream.ToArray();
            }
        }

        /// <summary>
        /// Finds the first occurrence of <paramref name="sFieldName"/> in the markup and returns
        /// the text of the first quoted <c>value=</c> attribute that follows it.
        /// </summary>
        /// <returns>The attribute value (possibly empty), or <c>null</c> when it cannot be located.</returns>
        private static string ExtractHiddenFieldValue(string sSource, string sFieldName)
        {
            const string sValueString = "value=" + QuoteString;

            int nFieldIndex = sSource.IndexOf(sFieldName, StringComparison.Ordinal);
            if (nFieldIndex < 0)
            {
                return null;
            }

            int nValueIndex = sSource.IndexOf(sValueString, nFieldIndex, StringComparison.Ordinal);
            if (nValueIndex < 0)
            {
                return null;
            }

            int nStart = nValueIndex + sValueString.Length;
            int nEnd = sSource.IndexOf(QuoteString, nStart, StringComparison.Ordinal);
            if (nEnd < 0)
            {
                return null;
            }

            return sSource.Substring(nStart, nEnd - nStart);
        }
    }
}

更新:

POST 数据很可能与某种会话相关,而该会话在您来得及测试之前就已经超时了。因此,这意味着我们必须更有创造性。我承认这里面有一些反复试验的成分,而且这段代码确实是为这个 url 量身定制的,对同一网站上的其他 pdf 可能适用,也可能不适用。通过将我之前发布的 sPostData 字符串与我刚刚通过 Fiddler 代理运行该网站获取的新字符串进行比较,我发现在 POST 的众多变量中只有 2 个发生了变化。这两个变量都可以在原始 C# 代码所下载的 html 源代码中找到。所以我们要做的只是一点字符串处理:取出这两个变量的值,然后照常执行我最初发布的代码。看到了吗?我们现在是在一起解决问题了!这份更新后的代码现在应该每次都有效,而不会再返回 500 内部服务器错误。

注意:由于我们要 POST 的数据尚未针对 Web 进行正确编码,因此必须添加对 System.Web 的引用,才能使用 UrlEncode 方法。为此,您需要:

  1. 右键单击"解决方案资源管理器"中的"引用",然后选择"添加引用"
  2. 单击左侧的"程序集",然后在"框架"部分找到"System.Web",或使用最右侧的搜索框
  3. 在"System.Web"旁边打勾,然后单击"确定"

下面是用于简化回发数据捕获的代码:

        // Builds a form-encoded POST body from every hidden <input> on the page,
        // forcing __EVENTTARGET to "btnPageLoad" and URL-escaping each name and value.
        var postData = new System.Collections.Generic.List<string>();
        var document = new HtmlWeb().Load(url);

        // SelectNodes returns null (not an empty collection) when nothing matches,
        // so guard before iterating.
        var hiddenInputs = document.DocumentNode.SelectNodes("//input[@type='hidden']");
        if (hiddenInputs != null)
        {
            foreach (var input in hiddenInputs)
            {
                var name = input.GetAttributeValue("name", "");
                var value = name == "__EVENTTARGET"
                    ? "btnPageLoad"
                    : input.GetAttributeValue("value", "");
                postData.Add(string.Format("{0}={1}", Uri.EscapeDataString(name), Uri.EscapeDataString(value)));
            }
        }

        // string.Join yields an empty body when no hidden inputs were found,
        // instead of throwing on postData[0].
        var postBody = string.Join("&", postData);
// Downloads the PDF at sPDFPath and writes its bytes to the current HTTP response
// with a PDF content type. "FULL PATH" is a placeholder for the real address.
string sPDFPath = "FULL PATH";
Byte[] FileBuffer;

// WebClient is IDisposable; the using block releases it once the download completes.
using (WebClient pdfClient = new WebClient())
{
    FileBuffer = pdfClient.DownloadData(sPDFPath);
}

if (FileBuffer != null)
{
    Response.ContentType = "application/pdf";
    Response.AddHeader("content-length", FileBuffer.Length.ToString());
    Response.BinaryWrite(FileBuffer);
}