PdfExtractor

PdfExtractor

Overview

The PdfExtractor feature allows users to upload PDF documents, extract data from them, and save the extracted data to a database. This functionality is implemented across multiple layers of the application, including the API, models, services, and web pages.

Details of Files & Method Names

Details of Files & Method Names
Functionalities
- Upload PDF, Convert to Base64, Call API, Store Extracted Data
File Names
Model & Class
- PdfExtractor.cs
Repository
- IPdfExtractorRepository.cs, PdfExtractorRepository.cs
Service
- IPdfExtractorService.cs, PdfExtractorService.cs
Controller
- PdfExtractorController.cs
Razor Pages
- PdfExtractorUpload.razor
Page Directive
- pdfextractorupload
Method Names
- ExtractPdfData, InsertExtractedPdfData, PdfExtractorUploadPage

App Settings (WEB)

  "PdfExtractor":
  {
    "ApiUri" : "https://dd3842sby11lo.cloudfront.net/get-data"
  }

Razor Page

@page "/pdfextractorupload"
@inject IPdfExtractorService _IPdfExtractorService

@* Page heading. *@
<MudText Typo="Typo.h4" Align="Align.Center">Create GRN/Bill through AI (Preview)</MudText>

@* Upload form: collects a document number and one PDF file; a valid submit invokes PdfExtractorUploadPage. *@
<MudPaper Class="pa-6 mx-auto max-w-500 d-flex flex-column align-center elevation-3 rounded-lg" Style="margin-top:40px;">
    <EditForm Model="@_PdfExtractor" OnValidSubmit="PdfExtractorUploadPage">
        <DataAnnotationsValidator />

        @* Document number, bound to _PdfExtractor.DocNo and validated through the model's data annotations. *@
        <MudTextField @bind-Value="_PdfExtractor.DocNo" Label="Document No." Variant="Variant.Outlined"
        Adornment="Adornment.Start" AdornmentIcon="@Icons.Material.Filled.Description"
        For="@(() => _PdfExtractor.DocNo)" Class="mb-4 w-100" />

        @* File picker bound to uploadedFile; limited to one file, with the browser dialog filtered to .pdf. The picker button is disabled while isProcessing is true. *@
        <MudPaper Class="pa-4 mb-3 rounded-lg border-dashed border-2 border-primary d-flex flex-row align-center justify-center" Style="min-height:120px;">
            <MudFileUpload T="IBrowserFile" For="@(() => uploadedFile)" @bind-Files="uploadedFile" Accept=".pdf" MaximumFileCount="1">
                <ButtonTemplate Context="file">
                    <MudButton HtmlTag="label" Variant="Variant.Filled" Color="Color.Tertiary"
                            for="@file" StartIcon="@Icons.Material.Filled.AttachFile"
                            Disabled="@isProcessing" Class="mb-2">
                        Choose PDF
                    </MudButton>
                </ButtonTemplate>
                <SelectedTemplate Context="file">
                    @if (file != null)
                    {
                        <MudChip Color="Color.Primary" Variant="Variant.Outlined" StartIcon="@Icons.Material.Filled.PictureAsPdf">
                            @file.Name
                        </MudChip>
                    }
                    else
                    {
                        <MudText Class="text-muted">No file selected</MudText>
                    }
                </SelectedTemplate>
            </MudFileUpload>
        </MudPaper>

        @* Progress indicator rendered only while isProcessing is true. *@
        @if (isProcessing)
        {
            <div class="d-flex align-center my-2">
                <MudProgressCircular Indeterminate="true" Color="Color.Primary" Class="mr-2" />
                <MudText>Extracting data please wait</MudText>
            </div>
        }

        @* Submit button; disabled while isProcessing is true so the form cannot be submitted twice. *@
        <MudButton ButtonType="ButtonType.Submit" Variant="Variant.Filled" Color="Color.Primary"
        StartIcon="@Icons.Material.Filled.Save" Disabled="@isProcessing"
        Class="mt-3 w-100 py-2">Submit</MudButton>
    </EditForm>
</MudPaper>

@code {
    // Largest upload accepted, in bytes (5 MB). Shared by the size check and the read-stream cap
    // so the two limits cannot drift apart.
    private const long MaxPdfSizeBytes = 5 * 1024 * 1024;

    // Folder where the uploaded PDF is staged. PdfExtractorService reads the file back from this
    // same folder, so the value must stay in sync with the service.
    private const string UploadDirectory = "\\MFGReports\\Docs\\PdfExtractor";

    private bool isProcessing = false;
    private PdfExtractor _PdfExtractor = new();
    private PostLogin _PostLogin = new();
    IBrowserFile? uploadedFile;

    /// <summary>
    /// Loads the stored "userinfo" session item and redirects away from the page when the
    /// browser session id does not match the stored user session id, or when the user has no
    /// "PdfDataExtractor" role.
    /// </summary>
    protected override async Task OnInitializedAsync()
    {
        _PostLogin = await _LocalSession.GetItemAsync<PostLogin>("userinfo");

        var rolecnt = _IPostLoginService.GetUserRolesCnt(_PostLogin.userdb, "PdfDataExtractor", _PostLogin.dbname);
        var usersessionid = _IPostLoginService.GetUserSessionId(_PostLogin.userdb);
        // The session id comes back wrapped in double quotes; strip them before comparing.
        usersessionid = usersessionid.Replace("\"", "");

        if (_PostLogin.BrowserSessionId != usersessionid)
        {
            NavManager.NavigateTo("/invaliduser/5");
        }
        else if (rolecnt == 0)
        {
            NavManager.NavigateTo("/accessdenied/1");
        }
    }

    /// <summary>
    /// Submit handler: validates the selected file and document number, stages the PDF on disk,
    /// asks the service to extract and store its data, then reloads the page on success.
    /// Failures are reported through the snackbar and the form is always re-enabled.
    /// </summary>
    private async Task PdfExtractorUploadPage()
    {
        if (uploadedFile == null)
        {
            _ISnackbar.Add("No file uploaded.", Severity.Warning);
            return;
        }

        if (uploadedFile.Size > MaxPdfSizeBytes)
        {
            _ISnackbar.Add("File size exceeds 5 MB limit.", Severity.Error);
            return;
        }

        // The Accept attribute only filters the browser dialog; the service always looks for a
        // ".pdf" file, so any other extension would be saved and then never found.
        var extension = Path.GetExtension(uploadedFile.Name);
        if (!string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase))
        {
            _ISnackbar.Add("Only PDF files can be uploaded.", Severity.Error);
            return;
        }

        // DocNo is user input that becomes part of a file name: reject path separators and
        // other characters that are not valid in a file name.
        if (_PdfExtractor.DocNo.IndexOfAny(Path.GetInvalidFileNameChars()) >= 0)
        {
            _ISnackbar.Add("Document No. contains characters that are not allowed.", Severity.Error);
            return;
        }

        var succeeded = false;
        isProcessing = true;
        try
        {
            // CreateDirectory is a no-op when the folder already exists.
            Directory.CreateDirectory(UploadDirectory);
            var uploadFolder = Path.GetFullPath(UploadDirectory);

            _PdfExtractor.YearDocNo = $"{_PostLogin.yearlabel.Replace("-", "")}{_PdfExtractor.DocNo}";
            // Same path format the service uses to locate the staged file.
            var savefilepath = $"{uploadFolder}\\{_PdfExtractor.YearDocNo}{extension}";

            // Dispose both streams even when the copy fails, so the file handle is released.
            await using (var source = uploadedFile.OpenReadStream(MaxPdfSizeBytes))
            await using (var target = File.Create(savefilepath))
            {
                await source.CopyToAsync(target);
            }

            await _IPdfExtractorService.ExtractPdfData(_PdfExtractor, _PostLogin.dbname);
            _ISnackbar.Add("PDF data saved successfully.", Severity.Success);
            succeeded = true;
        }
        catch (Exception ex)
        {
            // Surface the failure to the user instead of leaving the page stuck in the
            // "processing" state.
            _ISnackbar.Add($"PDF extraction failed: {ex.Message}", Severity.Error);
        }
        finally
        {
            isProcessing = false;
        }

        if (succeeded)
        {
            // Force a full reload so the form starts empty for the next document.
            NavManager.NavigateTo("/pdfextractorupload", true);
        }
    }
}

Controller

using ErpCrystal_MFG.Api.Repositories;
using Microsoft.AspNetCore.Mvc;
using System.Threading.Tasks;
using ErpCrystal_MFG.Models;

namespace ErpCrystal_MFG.Api.Controllers
{
    /// <summary>
    /// API endpoints for storing data extracted from uploaded PDF documents.
    /// Routes follow the pattern <c>api/[controller]/[action]</c>.
    /// </summary>
    [Route("api/[controller]/[action]")]
    [ApiController]
    public class PdfExtractorController(IPdfExtractorRepository pdfscannerrepository) : ControllerBase
    {
        private readonly IPdfExtractorRepository _PdfExtractorRepository = pdfscannerrepository;

        /// <summary>
        /// Stores the extracted JSON carried by the posted model.
        /// </summary>
        /// <param name="_PdfExtractor">Posted model; its <c>YearDocNo</c> and <c>JsonData</c> are persisted.</param>
        /// <param name="dbname">Database name taken from the route; passed through to the repository.</param>
        /// <returns>An empty 200 OK result once the repository call completes.</returns>
        [HttpPost("{dbname}")]
        public async Task<IActionResult> InsertExtractedPdfData(PdfExtractor _PdfExtractor, string dbname)
        {
            var documentKey = _PdfExtractor.YearDocNo;
            var extractedJson = _PdfExtractor.JsonData;

            await _PdfExtractorRepository.InsertExtractedPdfData(documentKey, dbname, extractedJson);

            return Ok();
        }
    }
}

Repository Interface

namespace ErpCrystal_MFG.Api.Repositories
{
    /// <summary>
    /// Persistence contract for data extracted from uploaded PDF documents.
    /// </summary>
    public interface IPdfExtractorRepository
    {
        /// <summary>
        /// Stores the extracted JSON for one document.
        /// </summary>
        /// <param name="yearDocNo">Document key stored alongside the JSON.</param>
        /// <param name="dbname">Name used by the implementation to select the client database connection.</param>
        /// <param name="jsonData">Extracted data as a JSON string.</param>
        Task InsertExtractedPdfData(string yearDocNo, string dbname, string jsonData);
    }
}

Repository Implementation

using Dapper;
using ErpCrystal_MFG.Api.Context;

namespace ErpCrystal_MFG.Api.Repositories
{
    /// <summary>
    /// Dapper-based repository that writes extracted PDF data to the PdfExtractedData table.
    /// </summary>
    public class PdfExtractorRepository(DapperContext dappercontext) : IPdfExtractorRepository
    {
        private readonly DapperContext _DapperContext = dappercontext;

        /// <summary>
        /// Inserts one row into PdfExtractedData.
        /// </summary>
        /// <param name="yearDocNo">Value for the YearDocNo column.</param>
        /// <param name="dbname">Passed to <c>DapperContext.SetClientConnection</c> to obtain the connection.</param>
        /// <param name="jsonData">Value for the JsonData column.</param>
        public async Task InsertExtractedPdfData(string yearDocNo, string dbname, string jsonData)
        {
            const string query = "INSERT INTO PdfExtractedData (YearDocNo, JsonData) VALUES (@YearDocNo, @JsonData);";

            using var connection = _DapperContext.SetClientConnection(dbname);

            // Name the parameters exactly as the placeholders are written in the SQL text, so the
            // statement does not depend on case-insensitive parameter-name matching.
            var parameters = new
            {
                YearDocNo = yearDocNo,
                JsonData = jsonData
            };

            await connection.ExecuteAsync(query, parameters);
        }
    }
}

Service Interface

using ErpCrystal_MFG.Models;

namespace ErpCrystal_MFG.Web.Services
{
    /// <summary>
    /// Web-side service contract for extracting data from an uploaded PDF and saving it.
    /// </summary>
    public interface IPdfExtractorService
    {
        /// <summary>
        /// Extracts data from the PDF identified by the model and saves the result.
        /// </summary>
        /// <param name="_PdfExtractor">Model identifying the document; the implementation fills in its <c>JsonData</c>.</param>
        /// <param name="dbname">Database name forwarded to the API when saving.</param>
        Task ExtractPdfData(PdfExtractor _PdfExtractor, string dbname);
    }
}

Service Implementation

using System.Text;
using ErpCrystal_MFG.Models;

namespace ErpCrystal_MFG.Web.Services
{
    /// <summary>
    /// Sends a staged PDF to the configured extraction endpoint and forwards the returned JSON
    /// to the application's API for storage.
    /// </summary>
    public class PdfExtractorService : IPdfExtractorService
    {
        private readonly IHttpClientFactory _httpClientFactory;
        private readonly HttpClient _HttpClient;
        private readonly string CloudFrontEndpoint;

        /// <summary>
        /// Creates the service, resolving the "mfgapi" named client and the
        /// <c>PdfExtractor:ApiUri</c> configuration value.
        /// </summary>
        public PdfExtractorService(IHttpClientFactory httpClientFactory, IConfiguration config)
        {
            _httpClientFactory = httpClientFactory;
            _HttpClient = _httpClientFactory.CreateClient("mfgapi");
            CloudFrontEndpoint = config["PdfExtractor:ApiUri"] ?? "";
        }

        /// <summary>
        /// Reads the staged PDF for <paramref name="_PdfExtractor"/>, has it processed by the
        /// extraction endpoint, stores the returned JSON in <c>JsonData</c>, and posts the model
        /// to the API. The staged file is removed whether or not the calls succeed.
        /// </summary>
        /// <param name="_PdfExtractor">Model whose <c>YearDocNo</c> names the staged file; <c>JsonData</c> is set on success.</param>
        /// <param name="dbname">Database name appended to the API route.</param>
        /// <exception cref="InvalidOperationException">The extraction endpoint is not configured.</exception>
        /// <exception cref="FileNotFoundException">The staged PDF does not exist.</exception>
        /// <exception cref="HttpRequestException">The extraction endpoint or the API returned a non-success status.</exception>
        public async Task ExtractPdfData(PdfExtractor _PdfExtractor, string dbname)
        {
            // Fail with a clear message instead of an obscure URI error from HttpClient.
            if (string.IsNullOrWhiteSpace(CloudFrontEndpoint))
            {
                throw new InvalidOperationException("PdfExtractor:ApiUri is not configured.");
            }

            // Locate the PDF staged by the upload page (same folder and naming scheme).
            string pathdirname2 = "\\MFGReports\\Docs\\PdfExtractor";
            var savefilepath = $"{Path.GetFullPath(pathdirname2)}\\{_PdfExtractor.YearDocNo}.pdf";

            if (!File.Exists(savefilepath))
            {
                throw new FileNotFoundException($"PDF file not found at path: {savefilepath}");
            }

            try
            {
                var pdfData = await File.ReadAllBytesAsync(savefilepath);
                _PdfExtractor.JsonData = await RequestExtractionAsync(pdfData);

                // The API controller is PdfExtractorController routed as api/[controller]/[action],
                // so the controller segment is "pdfextractor".
                var route = $"/api/pdfextractor/insertextractedpdfdata/{Uri.EscapeDataString(dbname)}";
                using var apiResponse = await _HttpClient.PostAsJsonAsync(route, _PdfExtractor);

                if (!apiResponse.IsSuccessStatusCode)
                {
                    var apiError = await apiResponse.Content.ReadAsStringAsync();
                    throw new HttpRequestException($"Saving extracted PDF data failed: {apiResponse.StatusCode} - {apiError}");
                }
            }
            finally
            {
                // The staged copy is only needed for this call; remove it on failure as well so
                // files do not pile up on disk.
                File.Delete(savefilepath);
            }
        }

        // Posts the PDF as a Base64 JSON payload to the extraction endpoint and returns the raw response body.
        private async Task<string> RequestExtractionAsync(byte[] pdfData)
        {
            var payload = new
            {
                query = "Extract fields from this invoice",
                file_bytes = Convert.ToBase64String(pdfData),
                role = "Human"
            };

            // Separate client: the extraction endpoint is an absolute URL, not the "mfgapi" API.
            using var httpClient = _httpClientFactory.CreateClient();

            var jsonPayload = System.Text.Json.JsonSerializer.Serialize(payload);
            using var content = new StringContent(jsonPayload, Encoding.UTF8, "application/json");
            using var response = await httpClient.PostAsync(CloudFrontEndpoint, content);

            if (!response.IsSuccessStatusCode)
            {
                var errorContent = await response.Content.ReadAsStringAsync();
                throw new HttpRequestException($"CloudFront request failed: {response.StatusCode} - {errorContent}");
            }

            return await response.Content.ReadAsStringAsync();
        }
    }
}

Model

using System.ComponentModel.DataAnnotations;

namespace ErpCrystal_MFG.Models
{
    /// <summary>
    /// Carries a document number, the derived year-prefixed document key, and the JSON
    /// extracted from the uploaded PDF.
    /// </summary>
    public class PdfExtractor
    {
        /// <summary>Document number entered by the user; required.</summary>
        [Required(ErrorMessage ="Doc No needs to be entered.")]
        public string DocNo { get; set; } = string.Empty;

        /// <summary>Extracted data as a JSON string.</summary>
        public string JsonData { get; set; } = string.Empty;

        /// <summary>Year label (dashes removed) followed by <see cref="DocNo"/>; set by the upload page.</summary>
        public string YearDocNo { get; set; } = string.Empty;
    }
}

SQL Query

The table definition and the SQL statement used to insert the extracted data into the database:

-- Table holding one row per extracted PDF document.
CREATE TABLE PdfExtractedData ( 
    -- Auto-incrementing surrogate key.
    Id INT IDENTITY(1,1) PRIMARY KEY,
    -- Document key built by the upload page: year label with dashes removed, followed by the document number.
    -- NOTE(review): confirm 13 characters is enough for the longest year label + document number.
    YearDocNo NVARCHAR(13),
    -- Raw JSON response returned by the extraction endpoint.
    JsonData NVARCHAR(MAX)
);
-- Parameterized insert executed by PdfExtractorRepository.InsertExtractedPdfData.
INSERT INTO PdfExtractedData (YearDocNo, JsonData) VALUES (@YearDocNo, @JsonData);

Summary

The PdfExtractor functionality is implemented across multiple layers of the application, with clear separation of concerns between the model, controller, repository, service, and UI components. The process involves uploading a PDF, extracting data from it using a CloudFront endpoint, and saving the extracted data to a database.