Skip to content

Commit f322873

Browse files
committed
added Azure Document Intelligence example to semantic search PDF file
1 parent e2028ff commit f322873

File tree

3 files changed

+173
-0
lines changed

3 files changed

+173
-0
lines changed
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
using Azure;
2+
using Azure.AI.DocumentIntelligence;
3+
using System;
4+
using System.Text;
5+
using System.IO;
6+
using System.Threading.Tasks;
7+
using Build5Nines.SharpVector;
8+
9+
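// This sample uses the Azure Document Intelligence client library to analyze a document with the prebuilt-read model,
// loads the text of each page into a SharpVector in-memory vector database, and then runs semantic searches against it.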
10+
string endpoint = "https://<resource-name>.cognitiveservices.azure.com/";
11+
string apiKey = "<your-key>";
12+
string filePath = "document.pdf"; // Can be .pdf, .docx, .jpg, etc.
13+
14+
// Create timers to measure how long it takes to run the code
15+
var overallTimer = new System.Diagnostics.Stopwatch();
16+
var stepTimer = new System.Diagnostics.Stopwatch();
17+
overallTimer.Start();
18+
19+
20+
// Create a DocumentIntelligenceClient
21+
var credential = new AzureKeyCredential(apiKey);
22+
var client = new DocumentIntelligenceClient(new Uri(endpoint), credential);
23+
24+
var vdb = new BasicMemoryVectorDatabase();
25+
26+
27+
28+
29+
30+
// Read the file into a BinaryData object
31+
Console.WriteLine("Reading file...");
32+
stepTimer.Start();
33+
34+
using var stream = File.OpenRead(filePath);
35+
byte[] buffer = new byte[stream.Length];
36+
await stream.ReadExactlyAsync(buffer, 0, buffer.Length); // ReadAsync may return before the buffer is full; ReadExactlyAsync fills it completely or throws EndOfStreamException
37+
var binaryData = BinaryData.FromBytes(buffer);
38+
39+
stepTimer.Stop();
40+
Console.WriteLine($"File loaded into memory: {stepTimer.ElapsedMilliseconds} ms");
41+
42+
Console.WriteLine("Analyzing document with Azure Document Intelligence...");
43+
stepTimer.Restart();
44+
45+
// Analyze the document using the prebuilt-read model
46+
var operation = await client.AnalyzeDocumentAsync(
47+
WaitUntil.Completed,
48+
"prebuilt-read",
49+
binaryData);
50+
51+
var docResult = operation.Value;
52+
53+
stepTimer.Stop();
54+
Console.WriteLine($"Document analysis completed: {stepTimer.ElapsedMilliseconds} ms");
55+
56+
stepTimer.Restart();
57+
Console.WriteLine("Loading SharpVector database...");
58+
59+
foreach (var page in docResult.Pages)
60+
{
61+
var sb = new StringBuilder();
62+
foreach (var line in page.Lines)
63+
{
64+
sb.AppendLine(line.Content);
65+
}
66+
67+
// Add the text to the vector database
68+
// Let's use the Page Number as the metadata
69+
// Note: In a real-world scenario, you might want to use more meaningful metadata
70+
var textMetadata = page.PageNumber.ToString();
71+
vdb.AddText(sb.ToString(), textMetadata);
72+
}
73+
74+
stepTimer.Stop();
75+
Console.WriteLine($"SharpVector database loaded: {stepTimer.ElapsedMilliseconds} ms");
76+
77+
78+
79+
80+
81+
// Console.WriteLine("");
82+
// Console.WriteLine("Loading PDF File into vector database...");
83+
// stepTimer.Restart();
84+
// // read pdf file with PdfPig locally
85+
// var vdb2 = new BasicMemoryVectorDatabase();
86+
// using (var pdfDocument = UglyToad.PdfPig.PdfDocument.Open(filePath))
87+
// {
88+
// foreach (var page in pdfDocument.GetPages())
89+
// {
90+
// // Add the text to the vector database
91+
// // Let's use the Page Number as the metadata
92+
// // Note: In a real-world scenario, you might want to use more meaningful metadata
93+
// var metadata = page.Number.ToString();
94+
// vdb2.AddText(page.Text, metadata);
95+
// }
96+
// }
97+
// stepTimer.Stop();
98+
// Console.WriteLine($"Vector database loaded: {stepTimer.ElapsedMilliseconds} ms");
99+
100+
101+
102+
103+
104+
105+
106+
107+
Console.WriteLine("");
108+
Console.WriteLine("Searching in SharpVector database for \"Azure ML\" with similarity score > 0.5...");
109+
stepTimer.Restart();
110+
111+
var query = "Azure ML";
112+
var semanticResults = vdb.Search(
113+
query,
114+
threshold: 0.5f // Set a threshold for the similarity score to only match results above this value
115+
);
116+
117+
stepTimer.Stop();
118+
Console.WriteLine($"Search completed: {stepTimer.ElapsedMilliseconds} ms");
119+
120+
121+
Console.WriteLine("Top Matching Results:");
122+
foreach (var result in semanticResults.Texts)
123+
{
124+
//var text = result.Text;
125+
var metadata = result.Metadata;
126+
var similarity = result.VectorComparison;
127+
Console.WriteLine($" - Page: {metadata} - Similarity: {similarity}");
128+
}
129+
130+
131+
Console.WriteLine("");
132+
Console.WriteLine("Searching in SharpVector database for \"Why use a Cloud Adoption Framework strategy\", top 3 results...");
133+
stepTimer.Restart();
134+
135+
query = "Why use a Cloud Adoption Framework strategy";
136+
semanticResults = vdb.Search(
137+
query,
138+
pageCount: 3 // Set the number of top results to return
139+
);
140+
141+
stepTimer.Stop();
142+
Console.WriteLine($"Search completed: {stepTimer.ElapsedMilliseconds} ms");
143+
144+
145+
Console.WriteLine("Top Matching Results:");
146+
foreach (var result in semanticResults.Texts)
147+
{
148+
//var text = result.Text;
149+
var metadata = result.Metadata;
150+
var similarity = result.VectorComparison;
151+
Console.WriteLine($" - Page: {metadata} - Similarity: {similarity}");
152+
}
153+
154+
overallTimer.Stop();
155+
Console.WriteLine("");
156+
Console.WriteLine($"Overall processing time: {overallTimer.ElapsedMilliseconds} ms");
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>net9.0</TargetFramework>
6+
<RootNamespace>b59_azure_doc_intelligence</RootNamespace>
7+
<ImplicitUsings>enable</ImplicitUsings>
8+
<Nullable>enable</Nullable>
9+
</PropertyGroup>
10+
11+
<ItemGroup>
12+
<PackageReference Include="Azure.AI.DocumentIntelligence" Version="1.0.0" />
13+
<PackageReference Include="Build5nines.sharpvector" Version="2.1.1" />
14+
<PackageReference Include="PdfPig" Version="0.1.10" />
15+
</ItemGroup>
16+
17+
</Project>
Binary file not shown.

0 commit comments

Comments
 (0)