PdfExtractor
Overview
The PdfExtractor feature allows users to upload PDF documents, extract data from them, and save the extracted data to a database. This functionality is implemented across multiple layers of the application, including the API, models, services, and web pages.
Details of Files & Method Names
| Details of Files & Method Names |
|---|
| Functionalities - Upload PDF, Convert to Base64, Call API, Store Extracted Data |
| File Names: Model & Class - PdfExtractor.cs; Repository - IPdfExtractorRepository.cs, PdfExtractorRepository.cs; Service - IPdfExtractorService.cs, PdfExtractorService.cs; Controller - PdfExtractorController.cs; Razor Pages - PdfExtractorUpload.razor |
| Page Directive - pdfextractorupload |
| Method Names - ExtractPdfData, InsertExtractedPdfData, PdfExtractorUploadPage |
App Settings (WEB)
"PdfExtractor":
{
"ApiUri" : "https://dd3842sby11lo.cloudfront.net/get-data"
}
Razor Page
@page "/pdfextractorupload"
@inject IPdfExtractorService _IPdfExtractorService
@* Page heading. *@
<MudText Typo="Typo.h4" Align="Align.Center">Create GRN/Bill through AI (Preview)</MudText>
<MudPaper Class="pa-6 mx-auto max-w-500 d-flex flex-column align-center elevation-3 rounded-lg" Style="margin-top:40px;">
@* Form bound to _PdfExtractor; a submit that passes data-annotation validation invokes PdfExtractorUploadPage. *@
<EditForm Model="@_PdfExtractor" OnValidSubmit="PdfExtractorUploadPage">
<DataAnnotationsValidator />
<MudTextField @bind-Value="_PdfExtractor.DocNo" Label="Document No." Variant="Variant.Outlined"
Adornment="Adornment.Start" AdornmentIcon="@Icons.Material.Filled.Description"
For="@(() => _PdfExtractor.DocNo)" Class="mb-4 w-100" />
<MudPaper Class="pa-4 mb-3 rounded-lg border-dashed border-2 border-primary d-flex flex-row align-center justify-center" Style="min-height:120px;">
@* File picker bound to uploadedFile. It requests a single .pdf file and shows either the chosen file name or a "No file selected" placeholder. *@
<MudFileUpload T="IBrowserFile" For="@(() => uploadedFile)" @bind-Files="uploadedFile" Accept=".pdf" MaximumFileCount="1">
<ButtonTemplate Context="file">
<MudButton HtmlTag="label" Variant="Variant.Filled" Color="Color.Tertiary"
for="@file" StartIcon="@Icons.Material.Filled.AttachFile"
Disabled="@isProcessing" Class="mb-2">
Choose PDF
</MudButton>
</ButtonTemplate>
<SelectedTemplate Context="file">
@if (file != null)
{
<MudChip Color="Color.Primary" Variant="Variant.Outlined" StartIcon="@Icons.Material.Filled.PictureAsPdf">
@file.Name
</MudChip>
}
else
{
<MudText Class="text-muted">No file selected</MudText>
}
</SelectedTemplate>
</MudFileUpload>
</MudPaper>
@* Progress indicator, rendered only while isProcessing is true. *@
@if (isProcessing)
{
<div class="d-flex align-center my-2">
<MudProgressCircular Indeterminate="true" Color="Color.Primary" Class="mr-2" />
<MudText>Extracting data… please wait</MudText>
</div>
}
@* Submit button; disabled while isProcessing is true. *@
<MudButton ButtonType="ButtonType.Submit" Variant="Variant.Filled" Color="Color.Primary"
StartIcon="@Icons.Material.Filled.Save" Disabled="@isProcessing"
Class="mt-3 w-100 py-2">Submit</MudButton>
</EditForm>
</MudPaper>
@code {
// True while a submit is being handled; the markup uses it to disable the buttons and show the spinner.
private bool isProcessing;

// Form model bound to the EditForm (holds DocNo, and later YearDocNo / JsonData).
private PdfExtractor _PdfExtractor = new PdfExtractor();

// Logged-in user information; replaced with the "userinfo" session entry in OnInitializedAsync.
private PostLogin _PostLogin = new PostLogin();

// File chosen in the MudFileUpload control; null until the user picks one.
private IBrowserFile? uploadedFile;
/// <summary>
/// Loads the "userinfo" session entry and checks that the caller may use this page.
/// Navigates to "/invaliduser/5" when the browser session id differs from the stored one,
/// otherwise to "/accessdenied/1" when the "PdfDataExtractor" role count is zero.
/// </summary>
protected override async Task OnInitializedAsync()
{
    _PostLogin = await _LocalSession.GetItemAsync<PostLogin>("userinfo");

    // Both lookups run before any check, in the same order as before.
    var roleCount = _IPostLoginService.GetUserRolesCnt(_PostLogin.userdb, "PdfDataExtractor", _PostLogin.dbname);
    var rawSessionId = _IPostLoginService.GetUserSessionId(_PostLogin.userdb);

    // The stored id may be wrapped in double quotes; strip them before comparing.
    var storedSessionId = rawSessionId.Replace("\"", "");

    // The session check takes precedence over the role check.
    if (_PostLogin.BrowserSessionId != storedSessionId)
    {
        NavManager.NavigateTo("/invaliduser/5");
        return;
    }

    if (roleCount == 0)
    {
        NavManager.NavigateTo("/accessdenied/1");
    }
}
/// <summary>
/// Handles a valid form submit: validates the selected file and the document number, saves the
/// upload as <c>{YearDocNo}.pdf</c> in the PdfExtractor documents folder, asks the service to
/// extract and store its data, and reloads the page on success.
/// </summary>
/// <remarks>
/// Any failure is reported through the snackbar and <c>isProcessing</c> is always reset,
/// so the form never stays locked in the "processing" state.
/// </remarks>
private async Task PdfExtractorUploadPage()
{
    const long maxFileSizeBytes = 5 * 1024 * 1024; // 5 MB limit

    if (uploadedFile == null)
    {
        _ISnackbar.Add("No file uploaded.", Severity.Warning);
        return;
    }

    // Validate file size
    if (uploadedFile.Size > maxFileSizeBytes)
    {
        _ISnackbar.Add("File size exceeds 5 MB limit.", Severity.Error);
        return;
    }

    // The service reads the upload back as "{YearDocNo}.pdf"; the Accept attribute on the
    // picker is only a browser hint, so the extension is checked here as well.
    if (!string.Equals(Path.GetExtension(uploadedFile.Name), ".pdf", StringComparison.OrdinalIgnoreCase))
    {
        _ISnackbar.Add("Only PDF files can be uploaded.", Severity.Error);
        return;
    }

    // DocNo is typed by the user and becomes part of a server-side file name, so path
    // separators and other invalid file-name characters must not reach the file system.
    if (_PdfExtractor.DocNo.IndexOfAny(Path.GetInvalidFileNameChars()) >= 0)
    {
        _ISnackbar.Add("Document No. contains characters that are not allowed.", Severity.Error);
        return;
    }

    isProcessing = true;
    var saved = false;
    try
    {
        // Save the PDF file
        string pathdirname2 = $"\\MFGReports\\Docs\\PdfExtractor";
        // CreateDirectory does nothing when the directory already exists.
        Directory.CreateDirectory(pathdirname2);
        var pathfilename2 = Path.GetFullPath(pathdirname2);
        _PdfExtractor.YearDocNo = $"{_PostLogin.yearlabel.Replace("-", "")}{_PdfExtractor.DocNo}";
        // Always stored with a lower-case ".pdf" extension, matching the path the service builds.
        var savefilepath = $"{pathfilename2}\\{_PdfExtractor.YearDocNo}.pdf";

        // Both streams are disposed (and the file flushed) before the service reads it back,
        // including when the copy fails part-way.
        await using (Stream stream = uploadedFile.OpenReadStream(maxFileSizeBytes))
        await using (FileStream fs = File.Create(savefilepath))
        {
            await stream.CopyToAsync(fs);
        }

        // Save PDF data
        await _IPdfExtractorService.ExtractPdfData(_PdfExtractor, _PostLogin.dbname);
        _ISnackbar.Add("PDF data saved successfully.", Severity.Success);
        saved = true;
    }
    catch (Exception ex)
    {
        // Top-level UI handler: surface the failure to the user instead of leaving it unhandled.
        _ISnackbar.Add($"PDF data could not be saved: {ex.Message}", Severity.Error);
    }
    finally
    {
        isProcessing = false;
    }

    if (saved)
    {
        NavManager.NavigateTo("/pdfextractorupload", true);
    }
}
}
Controller
using ErpCrystal_MFG.Api.Repositories;
using Microsoft.AspNetCore.Mvc;
using System.Threading.Tasks;
using ErpCrystal_MFG.Models;
namespace ErpCrystal_MFG.Api.Controllers
{
/// <summary>
/// API controller that stores data extracted from PDFs.
/// Actions are routed as <c>api/PdfExtractor/{action}</c>.
/// </summary>
[Route("api/[controller]/[action]")]
[ApiController]
public class PdfExtractorController(IPdfExtractorRepository pdfscannerrepository) : ControllerBase
{
    private readonly IPdfExtractorRepository _PdfExtractorRepository = pdfscannerrepository;

    /// <summary>
    /// Passes the posted record's <c>YearDocNo</c> and <c>JsonData</c> to the repository for insertion.
    /// </summary>
    /// <param name="_PdfExtractor">Posted record carrying the values to store.</param>
    /// <param name="dbname">Route value forwarded to the repository as the database name.</param>
    /// <returns>An empty 200 OK result once the repository call has completed.</returns>
    [HttpPost("{dbname}")]
    public async Task<IActionResult> InsertExtractedPdfData(PdfExtractor _PdfExtractor, string dbname)
    {
        var yearDocNo = _PdfExtractor.YearDocNo;
        var extractedJson = _PdfExtractor.JsonData;

        await _PdfExtractorRepository.InsertExtractedPdfData(yearDocNo, dbname, extractedJson);

        return Ok();
    }
}
}
Repository Interface
namespace ErpCrystal_MFG.Api.Repositories
{
/// <summary>
/// Persistence contract for data extracted from uploaded PDF documents.
/// </summary>
public interface IPdfExtractorRepository
{
/// <summary>
/// Stores one extracted-data record.
/// </summary>
/// <param name="yearDocNo">Document key to store with the record.</param>
/// <param name="dbname">Name identifying the database the record is written to.</param>
/// <param name="jsonData">Extracted data as JSON text.</param>
/// <returns>A task that completes when the record has been stored.</returns>
Task InsertExtractedPdfData(string yearDocNo, string dbname, string jsonData);
}
}
Repository Implementation
using Dapper;
using ErpCrystal_MFG.Api.Context;
namespace ErpCrystal_MFG.Api.Repositories
{
/// <summary>
/// Dapper-based repository that writes extracted PDF data to the <c>PdfExtractedData</c> table.
/// </summary>
public class PdfExtractorRepository(DapperContext dappercontext) : IPdfExtractorRepository
{
    private readonly DapperContext _DapperContext = dappercontext;

    /// <summary>
    /// Inserts one row into <c>PdfExtractedData</c> using a parameterized statement.
    /// </summary>
    /// <param name="yearDocNo">Value for the <c>YearDocNo</c> column.</param>
    /// <param name="dbname">Passed to <c>DapperContext.SetClientConnection</c> to obtain the connection.</param>
    /// <param name="jsonData">Value for the <c>JsonData</c> column.</param>
    public async Task InsertExtractedPdfData(string yearDocNo, string dbname, string jsonData)
    {
        const string insertSql = "INSERT INTO PdfExtractedData (YearDocNo, JsonData) VALUES (@YearDocNo, @JsonData);";
        var parameters = new { yearDocNo, jsonData };

        using var clientConnection = _DapperContext.SetClientConnection(dbname);
        await clientConnection.ExecuteAsync(insertSql, parameters);
    }
}
}
Service Interface
using ErpCrystal_MFG.Models;
namespace ErpCrystal_MFG.Web.Services
{
/// <summary>
/// Web-side service contract for extracting data from an uploaded PDF.
/// </summary>
public interface IPdfExtractorService
{
/// <summary>
/// Extracts data for the document described by <paramref name="_PdfExtractor"/>.
/// </summary>
/// <param name="_PdfExtractor">Record identifying the document to process.</param>
/// <param name="dbname">Name identifying the database the extracted data belongs to.</param>
/// <returns>A task that completes when processing has finished.</returns>
Task ExtractPdfData(PdfExtractor _PdfExtractor, string dbname);
}
}
Service Implementation
using System.Text;
using ErpCrystal_MFG.Models;
namespace ErpCrystal_MFG.Web.Services
{
/// <summary>
/// Sends a previously saved PDF to the configured extraction endpoint and forwards the returned
/// JSON to the MFG API for storage.
/// </summary>
public class PdfExtractorService : IPdfExtractorService
{
    private readonly IHttpClientFactory _httpClientFactory;
    private readonly HttpClient _HttpClient;
    private readonly string CloudFrontEndpoint;

    /// <summary>
    /// Creates the service with the named "mfgapi" client and the endpoint read from
    /// the <c>PdfExtractor:ApiUri</c> configuration key.
    /// </summary>
    public PdfExtractorService(IHttpClientFactory httpClientFactory, IConfiguration config)
    {
        _httpClientFactory = httpClientFactory;
        _HttpClient = _httpClientFactory.CreateClient("mfgapi");
        CloudFrontEndpoint = config["PdfExtractor:ApiUri"] ?? "";
    }

    /// <summary>
    /// Reads <c>{YearDocNo}.pdf</c> from the PdfExtractor documents folder, posts it (Base64-encoded)
    /// to the extraction endpoint, stores the JSON response in <c>_PdfExtractor.JsonData</c> and posts
    /// the record to the API's <c>InsertExtractedPdfData</c> action. The saved PDF is deleted afterwards.
    /// </summary>
    /// <param name="_PdfExtractor">Record whose <c>YearDocNo</c> names the saved PDF; <c>JsonData</c> is filled in.</param>
    /// <param name="dbname">Database name appended to the API route.</param>
    /// <exception cref="InvalidOperationException">The extraction endpoint is not configured.</exception>
    /// <exception cref="FileNotFoundException">The saved PDF does not exist.</exception>
    /// <exception cref="HttpRequestException">The extraction endpoint or the API returned a non-success status.</exception>
    public async Task ExtractPdfData(PdfExtractor _PdfExtractor, string dbname)
    {
        if (string.IsNullOrWhiteSpace(CloudFrontEndpoint))
        {
            throw new InvalidOperationException("PdfExtractor:ApiUri is not configured.");
        }

        // Load the PDF file
        string pathdirname2 = $"\\MFGReports\\Docs\\PdfExtractor";
        var savefilepath = $"{Path.GetFullPath(pathdirname2)}\\{_PdfExtractor.YearDocNo}.pdf";
        if (!File.Exists(savefilepath))
        {
            throw new FileNotFoundException($"PDF file not found at path: {savefilepath}");
        }

        try
        {
            var pdfData = await File.ReadAllBytesAsync(savefilepath);

            // Convert PDF byte array to Base64 string
            var base64String = Convert.ToBase64String(pdfData);

            // Create JSON payload
            var payload = new
            {
                query = "Extract fields from this invoice",
                file_bytes = base64String,
                role = "Human"
            };

            // Separate, unnamed client for the extraction endpoint (absolute URI, no API base address)
            using var httpClient = _httpClientFactory.CreateClient();

            // Serialize the payload to JSON and create StringContent
            var jsonPayload = System.Text.Json.JsonSerializer.Serialize(payload);
            using var content = new StringContent(jsonPayload, Encoding.UTF8, "application/json");

            // Send POST request to CloudFront endpoint
            using var response = await httpClient.PostAsync(CloudFrontEndpoint, content);
            if (!response.IsSuccessStatusCode)
            {
                var errorContent = await response.Content.ReadAsStringAsync();
                throw new HttpRequestException(
                    $"CloudFront request failed: {response.StatusCode} - {errorContent}",
                    null,
                    response.StatusCode);
            }

            // Read the JSON response from CloudFront
            var jsonData = await response.Content.ReadAsStringAsync();
            _PdfExtractor.JsonData = jsonData;

            // The controller is PdfExtractorController routed as api/[controller]/[action],
            // so the segment must be "pdfextractor" (route matching is case-insensitive).
            var insertRoute = $"/api/pdfextractor/insertextractedpdfdata/{Uri.EscapeDataString(dbname)}";
            using var apiResponse = await _HttpClient.PostAsJsonAsync(insertRoute, _PdfExtractor);

            // A failed insert must not be reported as success to the caller.
            apiResponse.EnsureSuccessStatusCode();
        }
        finally
        {
            // The saved upload is a temporary copy; remove it whether or not processing succeeded.
            File.Delete(savefilepath);
        }
    }
}
}
Model
using System.ComponentModel.DataAnnotations;
namespace ErpCrystal_MFG.Models
{
    /// <summary>
    /// Data carried through the PDF extraction flow: the document number entered by the user,
    /// the derived storage key and the JSON returned by the extraction endpoint.
    /// </summary>
    public class PdfExtractor
    {
        /// <summary>Document number entered on the upload form; required.</summary>
        [Required(ErrorMessage ="Doc No needs to be entered.")]
        public string DocNo { get; set; } = string.Empty;

        /// <summary>JSON text produced by the extraction step; empty until extraction has run.</summary>
        public string JsonData { get; set; } = string.Empty;

        /// <summary>Key under which the document is saved and stored; empty until the upload page sets it.</summary>
        public string YearDocNo { get; set; } = string.Empty;
    }
}
SQL Query
The table that stores the extracted data, followed by the SQL query used to insert a row into it:
-- Table written to by the repository's InsertExtractedPdfData statement.
CREATE TABLE PdfExtractedData (
-- Auto-generated identity key.
Id INT IDENTITY(1,1) PRIMARY KEY,
-- Document key supplied by the application (at most 13 characters).
YearDocNo NVARCHAR(13),
-- JSON text returned by the extraction endpoint.
JsonData NVARCHAR(MAX)
);
INSERT INTO PdfExtractedData (YearDocNo, JsonData) VALUES (@YearDocNo, @JsonData);
Summary
The PdfExtractor functionality is implemented across multiple layers of the application, with clear separation of concerns between the model, controller, repository, service, and UI components. The process involves uploading a PDF, extracting data from it using a CloudFront endpoint, and saving the extracted data to a database.