This sample code shows how to use the Apryse Server OCR module to process scanned documents written in multiple languages. The OCR module can produce searchable PDFs and extract scanned text for further indexing. Learn more about our Server SDK.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6using System;
7using pdftron;
8using pdftron.Common;
9using pdftron.SDF;
10using pdftron.PDF;
11
namespace OCRTestCS
{
    /// <summary>
    /// Illustrates how to use the OCR module: converting scanned images and PDFs into
    /// searchable PDFs, restricting recognition with ignore/text zones, and round-tripping
    /// OCR results through JSON or XML before applying them to a document.
    /// </summary>
    class Class1
    {
        // The static field initializer and the explicit static constructor run before Main,
        // so the loader instance exists before any other PDFNet type is used.
        private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();
        static Class1() {}

        // Relative paths to the folders containing the test files and the generated output.
        private const string InputPath = "../../../../TestFiles/OCR/";
        private const string OutputPath = "../../../../TestFiles/Output/";

        /// <summary>
        /// The main entry point for the application. Runs the six OCR examples in order;
        /// a PDFNetException in one example is reported and does not stop the following ones.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // The first step in every application using PDFNet is to initialize the
            // library and set the path to common PDF resources. The library is usually
            // initialized only once, but calling Initialize() multiple times is also fine.
            PDFNet.Initialize(PDFTronLicense.Key);

            try
            {
                // Can optionally set path to the OCR module
                PDFNet.AddResourceSearchPath("../../../../../Lib/");

                // if the IRIS OCR module is available, will use that instead of the default
                bool useIris = OCRModule.IsIRISModuleAvailable();
                if (!OCRModule.IsModuleAvailable())
                {
                    Console.WriteLine("");
                    Console.WriteLine("Unable to run OCRTest: Apryse SDK OCR module not available.");
                    Console.WriteLine("---------------------------------------------------------------");
                    Console.WriteLine("The OCR module is an optional add-on, available for download");
                    Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
                    Console.WriteLine("module, ensure that the SDK is able to find the required files");
                    Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
                    Console.WriteLine("");
                    return;
                }

                RunExample(() => ProcessImage(useIris));
                RunExample(() => ProcessMultiLanguageImage(useIris));
                RunExample(() => ProcessPdfWithIgnoreZone(useIris));
                RunExample(() => ProcessMultipageTiffWithZones(useIris));
                RunExample(() => RoundTripOcrJson(useIris));
                RunExample(() => RoundTripOcrXml(useIris));
            }
            finally
            {
                // Terminate on every exit path, including the early return above and
                // any exception that is not a PDFNetException.
                PDFNet.Terminate();
            }
        }

        /// <summary>
        /// Runs one example and prints the message of any PDFNetException it throws,
        /// so that the remaining examples still get a chance to run.
        /// </summary>
        /// <param name="example">The example to execute.</param>
        private static void RunExample(Action example)
        {
            try
            {
                example();
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }

        /// <summary>
        /// Builds OCR options that select the IRIS engine when requested and add the
        /// given languages in the order supplied.
        /// </summary>
        /// <param name="useIris">True to select the "iris" OCR engine instead of the default.</param>
        /// <param name="languages">Language codes passed to AddLang, e.g. "eng" or "deu".</param>
        /// <returns>The configured options object.</returns>
        private static OCROptions CreateOptions(bool useIris, params string[] languages)
        {
            OCROptions opts = new OCROptions();
            if (useIris)
            {
                opts.SetOCREngine("iris");
            }

            foreach (string language in languages)
            {
                opts.AddLang(language);
            }

            return opts;
        }

        /// <summary>Example 1) Process image.</summary>
        private static void ProcessImage(bool useIris)
        {
            // A) Setup empty destination doc
            using (PDFDoc doc = new PDFDoc())
            {
                // B) Set English as the language of choice
                OCROptions opts = CreateOptions(useIris, "eng");

                // C) Run OCR on the .png with options
                OCRModule.ImageToPDF(doc, InputPath + "psychomachia_excerpt.png", opts);

                // D) Save the result
                doc.Save(OutputPath + "psychomachia_excerpt.pdf", SDFDoc.SaveOptions.e_remove_unused);

                Console.WriteLine("Example 1: psychomachia_excerpt.png");
            }
        }

        /// <summary>Example 2) Process document using multiple languages.</summary>
        private static void ProcessMultiLanguageImage(bool useIris)
        {
            // A) Setup empty destination doc
            using (PDFDoc doc = new PDFDoc())
            {
                // B) Setup options with multiple target languages, English will always be considered as secondary language
                OCROptions opts = CreateOptions(useIris, "deu", "fra", "eng");

                // C) Run OCR on the .jpg with options
                OCRModule.ImageToPDF(doc, InputPath + "multi_lang.jpg", opts);

                // D) Save the result
                doc.Save(OutputPath + "multi_lang.pdf", SDFDoc.SaveOptions.e_remove_unused);

                Console.WriteLine("Example 2: multi_lang.jpg");
            }
        }

        /// <summary>
        /// Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image.
        /// </summary>
        private static void ProcessPdfWithIgnoreZone(bool useIris)
        {
            // A) Open the .pdf document
            using (PDFDoc doc = new PDFDoc(InputPath + "german_kids_song.pdf"))
            {
                // B) Setup options with a single language and an ignore zone
                OCROptions opts = CreateOptions(useIris, "deu");

                RectCollection ignoreZones = new RectCollection();
                ignoreZones.AddRect(424, 163, 493, 730);
                opts.AddIgnoreZonesForPage(ignoreZones, 1);

                // C) Run OCR on the .pdf with options
                OCRModule.ProcessPDF(doc, opts);

                // D) Save the result
                doc.Save(OutputPath + "german_kids_song.pdf", SDFDoc.SaveOptions.e_remove_unused);

                Console.WriteLine("Example 3: german_kids_song.pdf");
            }
        }

        /// <summary>
        /// Example 4) Process multipage tiff with text/ignore zones specified for each page.
        /// </summary>
        private static void ProcessMultipageTiffWithZones(bool useIris)
        {
            // A) Setup empty destination doc
            using (PDFDoc doc = new PDFDoc())
            {
                // B) Setup options with a single language plus text/ignore zones
                OCROptions opts = CreateOptions(useIris, "eng");

                // One collection is refilled and re-registered for each page.
                RectCollection zones = new RectCollection();

                // ignore signature box in the first 2 pages
                zones.AddRect(1492, 56, 2236, 432);
                opts.AddIgnoreZonesForPage(zones, 1);
                zones.Clear();

                zones.AddRect(1492, 56, 2236, 432);
                opts.AddIgnoreZonesForPage(zones, 2);
                zones.Clear();

                // can use a combination of ignore and text boxes to focus on the page area of interest,
                // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
                zones.AddRect(992, 1276, 1368, 1372);
                opts.AddIgnoreZonesForPage(zones, 3);
                zones.Clear();

                // select horizontal BUFFER ZONE sign
                zones.AddRect(900, 2384, 1236, 2480);
                // select right vertical BUFFER ZONE sign
                zones.AddRect(1960, 1976, 2016, 2296);
                // select Lot No.
                zones.AddRect(696, 1028, 1196, 1128);

                // select part of the plan inside the BUFFER ZONE
                zones.AddRect(428, 1484, 1784, 2344);
                zones.AddRect(948, 1288, 1672, 1476);
                opts.AddTextZonesForPage(zones, 3);

                // C) Run OCR on the .tif with options
                OCRModule.ImageToPDF(doc, InputPath + "bc_environment_protection.tif", opts);

                // D) Save the result
                doc.Save(OutputPath + "bc_environment_protection.pdf", SDFDoc.SaveOptions.e_remove_unused);

                Console.WriteLine("Example 4: bc_environment_protection.tif");
            }
        }

        /// <summary>
        /// Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing
        /// words not in the dictionary or filtering out special characters), and finally applying the
        /// modified OCR JSON to the source PDF document.
        /// </summary>
        private static void RoundTripOcrJson(bool useIris)
        {
            // A) Open the .pdf document
            using (PDFDoc doc = new PDFDoc(InputPath + "zero_value_test_no_text.pdf"))
            {
                // B) set English language
                OCROptions opts = CreateOptions(useIris, "eng");

                // C) Run OCR on the .pdf
                string json = OCRModule.GetOCRJsonFromPDF(doc, opts);

                // D) Post-processing step (whatever it might be); this sample leaves the JSON
                //    untouched and only reports progress
                Console.WriteLine("Have OCR result JSON, re-applying to PDF");

                // E) Apply potentially modified OCR JSON to the PDF
                OCRModule.ApplyOCRJsonToPDF(doc, json);

                // F) Save the result
                doc.Save(OutputPath + "zero_value_test_no_text.pdf", SDFDoc.SaveOptions.e_remove_unused);

                Console.WriteLine("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf");
            }
        }

        /// <summary>
        /// Example 6) The postprocessing workflow has also an option of extracting OCR results in XML
        /// format, similar to the one used by TextExtractor.
        /// </summary>
        private static void RoundTripOcrXml(bool useIris)
        {
            // A) Setup empty destination doc
            using (PDFDoc doc = new PDFDoc())
            {
                // B) set English language
                OCROptions opts = CreateOptions(useIris, "eng");

                // C) Run OCR on the .tif with English language, extracting OCR results in XML format. Note that
                // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
                string xml = OCRModule.GetOCRXmlFromImage(doc, InputPath + "physics.tif", opts);

                // D) Post-processing step (whatever it might be); this sample leaves the XML
                //    untouched and only reports progress
                Console.WriteLine("Have OCR result XML, re-applying to PDF");

                // E) Apply potentially modified OCR XML to the PDF
                OCRModule.ApplyOCRXmlToPDF(doc, xml);

                // F) Save the result
                doc.Save(OutputPath + "physics.pdf", SDFDoc.SaveOptions.e_remove_unused);

                Console.WriteLine("Example 6: extracting and applying OCR XML from physics.tif");
            }
        }
    }
}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8 "fmt"
9 . "pdftron"
10)
11
12import "pdftron/Samples/LicenseKey/GO"
// Relative locations of the OCR sample inputs and of the folder that receives the generated PDFs.
var (
	inputPath  = "../../TestFiles/OCR/"
	outputPath = "../../TestFiles/Output/"
)
16
17// ---------------------------------------------------------------------------------------
18// The following sample illustrates how to use OCR module
19// --------------------------------------------------------------------------------------
20
21func main(){
22
23 // The first step in every application using PDFNet is to initialize the
24 // library and set the path to common PDF resources. The library is usually
25 // initialized only once, but calling Initialize() multiple times is also fine.
26 PDFNetInitialize(PDFTronLicense.Key)
27
28 // The location of the OCR Module
29 PDFNetAddResourceSearchPath("../../../PDFNetC/Lib/");
30
31 if ! OCRModuleIsModuleAvailable(){
32
33 fmt.Println("Unable to run OCRTest: PDFTron SDK OCR module not available.\n" +
34 "---------------------------------------------------------------\n" +
35 "The OCR module is an optional add-on, available for download\n" +
36 "at http://www.pdftron.com/. If you have already downloaded this\n" +
37 "module, ensure that the SDK is able to find the required files\n" +
38 "using the PDFNet::AddResourceSearchPath() function.")
39
40 }else{
41
42 // Example 1) Process image without specifying options, default language - English - is used
43 // --------------------------------------------------------------------------------
44
45 // A) Setup empty destination doc
46
47 doc := NewPDFDoc()
48
49 // B) Run OCR on the .png with options
50
51 ocrOpts := NewOCROptions()
52 OCRModuleImageToPDF(doc, inputPath + "psychomachia_excerpt.png", ocrOpts)
53
54 // C) Check the result
55
56 doc.Save(outputPath + "psychomachia_excerpt.pdf", uint(0))
57 fmt.Println("Example 1: psychomachia_excerpt.png")
58
59 // Example 2) Process document using multiple languages
60 // --------------------------------------------------------------------------------
61
62 // A) Setup empty destination doc
63
64 doc = NewPDFDoc()
65
66 // B) Setup options with multiple target languages, English will always be considered as secondary language
67
68 opts := NewOCROptions()
69 opts.AddLang("rus")
70 opts.AddLang("deu")
71
72 // C) Run OCR on the .jpg with options
73
74 OCRModuleImageToPDF(doc, inputPath + "multi_lang.jpg", opts)
75
76 // D) Check the result
77
78 doc.Save(outputPath + "multi_lang.pdf", uint(0))
79 fmt.Println("Example 2: multi_lang.jpg")
80
81 // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
82 // --------------------------------------------------------------------------------
83
84 // A) Open the .pdf document
85
86 doc = NewPDFDoc(inputPath + "german_kids_song.pdf")
87
88 // B) Setup options with a single language and an ignore zone
89
90 opts = NewOCROptions()
91 opts.AddLang("deu")
92
93 ignoreZones := NewRectCollection()
94 ignoreZones.AddRect(NewRect(424.0, 163.0, 493.0, 730.0))
95 opts.AddIgnoreZonesForPage(ignoreZones, 1)
96
97 // C) Run OCR on the .pdf with options
98
99 OCRModuleProcessPDF(doc, opts)
100
101 // D) check the result
102
103 doc.Save(outputPath + "german_kids_song.pdf", uint(0))
104 fmt.Println("Example 3: german_kids_song.pdf")
105
106 // Example 4) Process multi-page tiff with text/ignore zones specified for each page,
107 // optionally provide English as the target language
108 // --------------------------------------------------------------------------------
109
110 // A) Setup empty destination doc
111
112 doc = NewPDFDoc()
113
114 // B) Setup options with a single language plus text/ignore zones
115
116 opts = NewOCROptions()
117 opts.AddLang("eng")
118
119 ignoreZones = NewRectCollection()
120
121 // ignore signature box in the first 2 pages
122 ignoreZones.AddRect(NewRect(1492.0, 56.0, 2236.0, 432.0))
123 opts.AddIgnoreZonesForPage(ignoreZones, 1)
124
125 opts.AddIgnoreZonesForPage(ignoreZones, 2)
126
127 // can use a combination of ignore and text boxes to focus on the page area of interest,
128 // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
129 ignoreZones.Clear()
130 ignoreZones.AddRect(NewRect(992.0, 1276.0, 1368.0, 1372.0))
131 opts.AddIgnoreZonesForPage(ignoreZones, 3)
132
133 textZones := NewRectCollection()
134 // we only have text zones selected in page 3
135
136 // select horizontal BUFFER ZONE sign
137 textZones.AddRect(NewRect(900.0, 2384.0, 1236.0, 2480.0))
138
139 // select right vertical BUFFER ZONE sign
140 textZones.AddRect(NewRect(1960.0, 1976.0, 2016.0, 2296.0))
141 // select Lot No.
142 textZones.AddRect(NewRect(696.0, 1028.0, 1196.0, 1128.0))
143
144 // select part of the plan inside the BUFFER ZONE
145 textZones.AddRect(NewRect(428.0, 1484.0, 1784.0, 2344.0))
146 textZones.AddRect(NewRect(948.0, 1288.0, 1672.0, 1476.0))
147 opts.AddTextZonesForPage(textZones, 3)
148
149 // C) Run OCR on the .pdf with options
150
151 OCRModuleImageToPDF(doc, inputPath + "bc_environment_protection.tif", opts)
152
153 // D) check the result
154
155 doc.Save(outputPath + "bc_environment_protection.pdf", uint(0))
156 fmt.Println("Example 4: bc_environment_protection.tif")
157
158 // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing
159 // (e.g., removing words not in the dictionary or filtering special
160 // out special characters), and finally applying modified OCR JSON to the source PDF document
161 // --------------------------------------------------------------------------------
162
163 // A) Open the .pdf document
164
165 doc = NewPDFDoc(inputPath + "zero_value_test_no_text.pdf")
166
167 // B) Run OCR on the .pdf with default English language
168
169 opts = NewOCROptions()
170 json := OCRModuleGetOCRJsonFromPDF(doc, opts)
171
172 // C) Post-processing step (whatever it might be)
173
174 fmt.Println("Have OCR result JSON, re-applying to PDF")
175
176 OCRModuleApplyOCRJsonToPDF(doc, json)
177
178 // D) Check the result
179
180 doc.Save(outputPath + "zero_value_test_no_text.pdf", uint(0))
181 fmt.Println("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf")
182
183 // Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format,
184 // similar to the one used by TextExtractor
185 // --------------------------------------------------------------------------------
186
187 // A) Setup empty destination doc
188
189 doc = NewPDFDoc()
190
191 // B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
192 // in the process we convert the source image into PDF.
193 // We reuse this PDF document later to add hidden text layer to it.
194
195 xml := OCRModuleGetOCRXmlFromImage(doc, inputPath + "physics.tif", opts)
196
197 // C) Post-processing step (whatever it might be)
198
199 fmt.Println("Have OCR result XML, re-applying to PDF")
200
201 OCRModuleApplyOCRXmlToPDF(doc, xml)
202
203 // D) Check the result
204
205 doc.Save(outputPath + "physics.pdf", uint(0))
206 fmt.Println("Example 6: extracting and applying OCR XML from physics.tif")
207
208 // Example 7) Resolution can be manually set, when DPI missing from metadata or is wrong
209 // --------------------------------------------------------------------------------
210
211 // A) Setup empty destination doc
212
213 doc = NewPDFDoc()
214
215 // B) Setup options with a text zone
216
217 opts = NewOCROptions()
218 textZones = NewRectCollection()
219 textZones.AddRect(NewRect(140.0, 870.0, 310.0, 920.0))
220 opts.AddTextZonesForPage(textZones, 1)
221
222 // C) Manually override DPI
223 opts.AddDPI(100)
224
225 // D) Run OCR on the .jpg with options
226 OCRModuleImageToPDF(doc, inputPath + "corrupted_dpi.jpg", opts)
227
228 // E) Check the result
229 doc.Save(outputPath + "corrupted_dpi.pdf", uint(0))
230 PDFNetTerminate()
231 fmt.Println("Example 7: converting image with corrupted resolution metadata corrupted_dpi.jpg to pdf with searchable text")
232 }
233}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5#include <PDF/PDFNet.h>
6#include <PDF/PDFDoc.h>
7#include <PDF/OCRModule.h>
8#include <PDF/OCROptions.h>
9#include <SDF/Obj.h>
10#include <iostream>
11#include "../../LicenseKey/CPP/LicenseKey.h"
12
13using namespace std;
14using namespace pdftron;
15using namespace PDF;
16using namespace SDF;
17
18//---------------------------------------------------------------------------------------
19// The following sample illustrates how to use OCR module
20//---------------------------------------------------------------------------------------
21int main(int argc, char *argv[])
22{
23 try
24 {
25 // The first step in every application using PDFNet is to initialize the
26 // library and set the path to common PDF resources. The library is usually
27 // initialized only once, but calling Initialize() multiple times is also fine.
28 PDFNet::Initialize(LicenseKey);
29 // The location of the OCR Module
30 PDFNet::AddResourceSearchPath("../../../Lib/");
31
32 // if the IRIS OCR module is available, will use that instead of the default
33 const bool use_iris = OCRModule::IsIRISModuleAvailable();
34 if(!OCRModule::IsModuleAvailable())
35 {
36 cout << endl;
37 cout << "Unable to run OCRTest: Apryse SDK OCR module not available." << endl;
38 cout << "---------------------------------------------------------------" << endl;
39 cout << "The OCR module is an optional add-on, available for download" << endl;
40 cout << "at http://www.pdftron.com/. If you have already downloaded this" << endl;
41 cout << "module, ensure that the SDK is able to find the required files" << endl;
42 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
43 return 0;
44 }
45
46 // Relative path to the folder containing test files.
47 string input_path = "../../TestFiles/OCR/";
48 string output_path = "../../TestFiles/Output/";
49
50
51 //--------------------------------------------------------------------------------
52 // Example 1) Process image without specifying options, default language - English - is used
53 try
54 {
55
56 // A) Setup empty destination doc
57
58 PDFDoc doc;
59
60 // B) Run OCR on the .png without options
61
62 OCROptions opts;
63 if(use_iris) opts.SetOCREngine("iris");
64 OCRModule::ImageToPDF(doc, input_path + "psychomachia_excerpt.png", &opts);
65
66 // C) check the result
67
68 doc.Save(output_path + "psychomachia_excerpt.pdf", 0, 0);
69
70 cout << "Example 1: psychomachia_excerpt.png" << endl;
71
72 }
73 catch(Common::Exception& e)
74 {
75 cout << e << endl;
76 }
77 catch(...)
78 {
79 cout << "Unknown Exception" << endl;
80 }
81
82 //--------------------------------------------------------------------------------
83 // Example 2) Process document using multiple languages
84 try
85 {
86 // A) Setup empty destination doc
87
88 PDFDoc doc;
89
90 // B) Setup options with multiple target languages, English will always be considered as secondary language
91
92 OCROptions opts;
93 if(use_iris) opts.SetOCREngine("iris");
94 opts.AddLang("deu");
95 opts.AddLang("fra");
96 opts.AddLang("eng");
97
98 // C) Run OCR on the .jpg with options
99
100 OCRModule::ImageToPDF(doc, input_path + "multi_lang.jpg", &opts);
101
102 // D) check the result
103
104 doc.Save(output_path + "multi_lang.pdf", 0, 0);
105
106 cout << "Example 2: multi_lang.jpg" << endl;
107
108 }
109 catch (Common::Exception& e)
110 {
111 cout << e << endl;
112 }
113 catch (...)
114 {
115 cout << "Unknown Exception" << endl;
116 }
117
118 //--------------------------------------------------------------------------------
119 // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
120 try
121 {
122 // A) Open the .pdf document
123
124 PDFDoc doc((input_path + "german_kids_song.pdf").c_str());
125
126 // B) Setup options with a single language and an ignore zone
127
128 OCROptions opts;
129 if(use_iris) opts.SetOCREngine("iris");
130 opts.AddLang("deu");
131
132 RectCollection ignore_zones;
133 ignore_zones.AddRect(424, 163, 493, 730);
134 opts.AddIgnoreZonesForPage(ignore_zones, 1);
135
136 // C) Run OCR on the .pdf with options
137
138 OCRModule::ProcessPDF(doc, &opts);
139
140 // D) check the result
141
142 PDFDoc doc_out(doc);
143 doc_out.Save(output_path + "german_kids_song.pdf", 0, 0);
144
145 cout << "Example 3: german_kids_song.pdf" << endl;
146 }
147 catch (Common::Exception& e)
148 {
149 cout << e << endl;
150 }
151 catch (...)
152 {
153 cout << "Unknown Exception" << endl;
154 }
155
156 //--------------------------------------------------------------------------------
157 // Example 4) Process multipage tiff with text/ignore zones specified for each page, optionally provide English as the target language
158 try
159 {
160 // A) Setup empty destination doc
161
162 PDFDoc doc;
163
164 // B) Setup options with a single language plus text/ignore zones
165
166 OCROptions opts;
167 if(use_iris) opts.SetOCREngine("iris");
168 opts.AddLang("eng");
169
170 RectCollection ignore_zones;
171 // ignore signature box in the first 2 pages
172 ignore_zones.AddRect(1492, 56, 2236, 432);
173 opts.AddIgnoreZonesForPage(ignore_zones, 1);
174 opts.AddIgnoreZonesForPage(ignore_zones, 2);
175
176 // can use a combination of ignore and text boxes to focus on the page area of interest,
177 // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
178 ignore_zones.Clear();
179 ignore_zones.AddRect(992, 1276, 1368, 1372);
180 opts.AddIgnoreZonesForPage(ignore_zones, 3);
181
182 RectCollection text_zones;
183 // we only have text zones selected in page 3
184
185 // select horizontal BUFFER ZONE sign
186 text_zones.AddRect(900, 2384, 1236, 2480);
187 // select right vertical BUFFER ZONE sign
188 text_zones.AddRect(1960, 1976, 2016, 2296);
189 // select Lot No.
190 text_zones.AddRect(696, 1028, 1196, 1128);
191
192 // select part of the plan inside the BUFFER ZONE
193 text_zones.AddRect(428, 1484, 1784, 2344);
194 text_zones.AddRect(948, 1288, 1672, 1476);
195 opts.AddTextZonesForPage(text_zones, 3);
196
197 // C) Run OCR on the .tif with options
198
199 OCRModule::ImageToPDF(doc, input_path + "bc_environment_protection.tif", &opts);
200
201 // D) check the result
202
203 doc.Save(output_path + "bc_environment_protection.pdf", 0, 0);
204
205 cout << "Example 4: bc_environment_protection.tif" << endl;
206
207 }
208 catch (Common::Exception& e)
209 {
210 cout << e << endl;
211 }
212 catch (...)
213 {
214 cout << "Unknown Exception" << endl;
215 }
216
217 //--------------------------------------------------------------------------------
218 // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words not in the dictionary or filtering special
219 // out special characters), and finally applying modified OCR JSON to the source PDF document
220 try
221 {
222
223 // A) Open the .pdf document
224
225 PDFDoc doc((input_path + "zero_value_test_no_text.pdf").c_str());
226
227 // B) Run OCR on the .pdf with default English language
228 OCROptions opts;
229 if(use_iris) opts.SetOCREngine("iris");
230
231 UString json = OCRModule::GetOCRJsonFromPDF(doc, &opts);
232
233 // C) Post-processing step (whatever it might be)
234
235 cout << "Have OCR result JSON, re-applying to PDF " << endl;
236
237 // D) Apply potentially modified OCR JSON to the PDF
238
239 OCRModule::ApplyOCRJsonToPDF(doc, json);
240
241 // E) Check the result
242
243 PDFDoc doc_out(doc);
244 doc_out.Save(output_path + "zero_value_test_no_text.pdf", 0, 0);
245
246 cout << "Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf" << endl;
247
248 }
249 catch (Common::Exception& e)
250 {
251 cout << e << endl;
252 }
253 catch (...)
254 {
255 cout << "Unknown Exception" << endl;
256 }
257
258 //--------------------------------------------------------------------------------
259 // Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor
260 try
261 {
262
263 // A) Setup empty destination doc
264
265 PDFDoc doc;
266
267 // B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
268 // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
269
270 OCROptions opts;
271 if(use_iris) opts.SetOCREngine("iris");
272 UString xml = OCRModule::GetOCRXmlFromImage(doc, input_path + "physics.tif", NULL);
273
274 // C) Post-processing step (whatever it might be)
275
276 cout << "Have OCR result XML, re-applying to PDF" << endl;
277
278 // D) Apply potentially modified OCR XML to the PDF
279
280 OCRModule::ApplyOCRXmlToPDF(doc, xml);
281
282 // E) Check the result
283
284 PDFDoc doc_out(doc);
285 doc_out.Save(output_path + "physics.pdf", 0, 0);
286
287 cout << "Example 6: extracting and applying OCR XML from physics.tif" << endl;
288
289 }
290 catch (Common::Exception& e)
291 {
292 cout << e << endl;
293 }
294 catch (...)
295 {
296 cout << "Unknown Exception" << endl;
297 }
298
299 cout << "Done." << endl;
300
301 PDFNet::Terminate();
302 }
303 catch(Common::Exception& e)
304 {
305 cout << e << endl;
306 }
307 catch (...) {
308 cout << "Unknown Exception" << endl;
309 }
310
311 return 0;
312}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import com.pdftron.sdf.Obj;
7import com.pdftron.sdf.ObjSet;
8import com.pdftron.sdf.SDFDoc;
9import com.pdftron.pdf.*;
10
11import com.pdftron.common.PDFNetException;
12
13//---------------------------------------------------------------------------------------
14// The following sample illustrates how to use OCR module
15//---------------------------------------------------------------------------------------
16public class OCRTest {
17 public static void main(String[] args) {
18 try {
19 // The first step in every application using PDFNet is to initialize the
20 // library and set the path to common PDF resources. The library is usually
21 // initialized only once, but calling Initialize() multiple times is also fine.
22 PDFNet.initialize(PDFTronLicense.Key());
23 PDFNet.addResourceSearchPath("../../../Lib/");
24
25 boolean use_iris = OCRModule.isIRISModuleAvailable();
26 if( !OCRModule.isModuleAvailable() )
27 {
28 System.out.println("");
29 System.out.println("Unable to run OCRTest: Apryse SDK OCR module not available.");
30 System.out.println("---------------------------------------------------------------");
31 System.out.println("The OCR module is an optional add-on, available for download");
32 System.out.println("at http://www.pdftron.com/. If you have already downloaded this");
33 System.out.println("module, ensure that the SDK is able to find the required files");
34 System.out.println("using the PDFNet.addResourceSearchPath() function.");
35 System.out.println("");
36 return;
37 }
38
39 // Relative path to the folder containing test files.
40 String input_path = "../../TestFiles/OCR/";
41 String output_path = "../../TestFiles/Output/";
42
43 //--------------------------------------------------------------------------------
44 // Example 1) Process image without specifying options, default language - English - is used
45 try (PDFDoc doc = new PDFDoc()) // A) Setup empty destination doc
46 {
47 OCROptions options = new OCROptions();
48 if(use_iris) options.setOCREngine("iris");
49
50 // B) Run OCR on the .png with options
51 OCRModule.imageToPDF(doc, input_path + "psychomachia_excerpt.png", options);
52
53 // C) check the result
54 doc.save(output_path + "psychomachia_excerpt.pdf", SDFDoc.SaveMode.LINEARIZED, null);
55 System.out.println("Example 1: psychomachia_excerpt.png");
56
57 } catch (Exception e) {
58 e.printStackTrace();
59 }
60
61 //--------------------------------------------------------------------------------
62 // Example 2) Process document using multiple languages
63 try (PDFDoc doc = new PDFDoc()) // A) Setup empty destination doc
64 {
65 // B) Setup options with multiple target languages, English will always be considered as secondary language
66 OCROptions options = new OCROptions();
67 if(use_iris) options.setOCREngine("iris");
68 options.addLang("deu");
69 options.addLang("fra");
70 options.addLang("eng");
71
72 // C) Run OCR on the .jpg with options
73 OCRModule.imageToPDF(doc, input_path + "multi_lang.jpg", options);
74
75 // D) check the result
76 doc.save(output_path + "multi_lang.pdf", SDFDoc.SaveMode.LINEARIZED, null);
77 System.out.println("Example 2: multi_lang.jpg");
78 } catch (Exception e) {
79 e.printStackTrace();
80 }
81
82 //--------------------------------------------------------------------------------
83 // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
84 try (PDFDoc doc = new PDFDoc(input_path + "german_kids_song.pdf")) // A) Open the .pdf document
85 {
86 // B) Setup options with a single language and an ignore zone
87 OCROptions options = new OCROptions();
88 if(use_iris) options.setOCREngine("iris");
89 options.addLang("deu");
90
91 RectCollection zones = new RectCollection();
92 zones.addRect(424, 163, 493, 730);
93
94 options.addIgnoreZonesForPage(zones, 1);
95
96 // C) Run OCR on the .pdf with options
97 OCRModule.processPDF(doc, options);
98
99 // D) check the result
100 doc.save(output_path + "german_kids_song.pdf", SDFDoc.SaveMode.LINEARIZED, null);
101 System.out.println("Example 3: german_kids_song.pdf");
102 } catch (Exception e) {
103 e.printStackTrace();
104 }
105
106 //--------------------------------------------------------------------------------
107 // Example 4) Process multipage tiff with text/ignore zones specified for each page, optionally provide English as the target language
108
109 try (PDFDoc doc = new PDFDoc()) // A) Setup empty destination doc
110 {
111 // B) Setup options with a single language plus text/ignore zones
112 OCROptions options = new OCROptions();
113 if(use_iris) options.setOCREngine("iris");
114 options.addLang("eng");
115
116 RectCollection zones = new RectCollection();
117 zones.addRect(1492, 56, 2236, 432);
118
119 // ignore signature box in the first 2 pages
120 options.addIgnoreZonesForPage(zones, 1);
121 options.addIgnoreZonesForPage(zones, 2);
122
123 // can use a combination of ignore and text boxes to focus on the page area of interest,
124 // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
125 zones.clear();
126 zones.addRect(992, 1276, 1368, 1372);
127 options.addIgnoreZonesForPage(zones, 3);
128
129 // we only have text zones selected in page 3
130
131 zones.clear();
132 // select horizontal BUFFER ZONE sign
133 zones.addRect(900, 2384, 1236, 2480);
134 // select right vertical BUFFER ZONE sign
135 zones.addRect(1960, 1976, 2016, 2296);
136 // select Lot No.
137 zones.addRect(696, 1028, 1196, 1128);
138
139 // select part of the plan inside the BUFFER ZONE
140 zones.addRect(428, 1484, 1784, 2344);
141 zones.addRect(948, 1288, 1672, 1476);
142
143 options.addTextZonesForPage(zones, 3);
144
145 // C) Run OCR on the .tif with options
146 OCRModule.imageToPDF(doc, input_path + "bc_environment_protection.tif", options);
147
148 // D) check the result
149 doc.save(output_path + "bc_environment_protection.pdf", SDFDoc.SaveMode.LINEARIZED, null);
150 System.out.println("Example 4: bc_environment_protection.tif");
151 } catch (Exception e) {
152 e.printStackTrace();
153 }
154
155 //--------------------------------------------------------------------------------
156 // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words not in the dictionary or
157 // filtering out special characters), and finally applying modified OCR JSON to the source PDF document
158 try (PDFDoc doc = new PDFDoc(input_path + "zero_value_test_no_text.pdf")) // A) Open the .pdf document
159 {
160 OCROptions options = new OCROptions();
161 if(use_iris) options.setOCREngine("iris");
162
163 // B) Run OCR on the .pdf with default English language
164 String json = OCRModule.getOCRJsonFromPDF(doc, options);
165
166 // C) Post-processing step (whatever it might be), but we just print json here
167 System.out.println("Have OCR result JSON, re-applying to PDF");
168
169 // D) Apply potentially modified OCR JSON to the PDF
170 OCRModule.applyOCRJsonToPDF(doc, json);
171
172 // E) Check the result
173 doc.save(output_path + "zero_value_test_no_text.pdf", SDFDoc.SaveMode.LINEARIZED, null);
174 System.out.println("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf");
175 } catch (Exception e) {
176 e.printStackTrace();
177 }
178
179 //--------------------------------------------------------------------------------
180 // Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor
181 try (PDFDoc doc = new PDFDoc()) // A) Setup empty destination doc
182 {
183 OCROptions options = new OCROptions();
184 if(use_iris) options.setOCREngine("iris");
185
186 // B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
187 // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
188 String xml = OCRModule.getOCRXmlFromImage(doc, input_path + "physics.tif", options);
189
190 // C) Post-processing step (whatever it might be), but we just print XML here
191 System.out.println("Have OCR result XML, applying to PDF");
192
193 // D) Apply potentially modified OCR XML to the PDF
194 OCRModule.applyOCRXmlToPDF(doc, xml);
195
196 // E) Check the result
197 doc.save(output_path + "physics.pdf", SDFDoc.SaveMode.LINEARIZED, null);
198 System.out.println("Example 6: extracting and applying OCR XML from physics.tif");
199 }
200 catch (Exception e) {
201 e.printStackTrace();
202 }
203
204 PDFNet.terminate();
205 } catch (Exception e) {
206 e.printStackTrace();
207 }
208 }
209}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6
7const { PDFNet } = require('@pdftron/pdfnet-node');
8const PDFTronLicense = require('../LicenseKey/LicenseKey');
9
10((exports) => {
11 'use strict';
12
13 //---------------------------------------------------------------------------------------
14 // The following sample illustrates how to use OCR module
15 //---------------------------------------------------------------------------------------
16 exports.runOCRTest = () => {
17 const main = async () => {
18 try {
19
20 PDFNet.addResourceSearchPath('../../lib/');
21
22 const useIRIS = await PDFNet.OCRModule.isIRISModuleAvailable();
23 if (!(await PDFNet.OCRModule.isModuleAvailable())) {
24 console.log('\nUnable to run OCRTest: Apryse SDK OCR module not available.');
25 console.log('---------------------------------------------------------------');
26 console.log('The OCR module is an optional add-on, available for download');
27 console.log('at http://www.pdftron.com/. If you have already downloaded this');
28 console.log('module, ensure that the SDK is able to find the required files');
29 console.log('using the PDFNet.addResourceSearchPath() function.\n');
30
31 return;
32 }
33
34 // Relative path to the folder containing test files.
35 const input_path = '../TestFiles/OCR/';
36 const output_path = '../TestFiles/Output/';
37
38 //--------------------------------------------------------------------------------
39 // Example 1) Process image without specifying options, default language - English - is used
40 try {
41
42 // A) Setup empty destination doc
43 const doc = await PDFNet.PDFDoc.create();
44
45 await doc.initSecurityHandler();
46
47 const opts = new PDFNet.OCRModule.OCROptions();
48 if(useIRIS) opts.setOCREngine('iris');
49
50 // B) Run OCR on the .png with options
51 await PDFNet.OCRModule.imageToPDF(doc, input_path + 'psychomachia_excerpt.png', opts);
52
53 // C) check the result
54 await doc.save(output_path + 'psychomachia_excerpt.pdf', 0);
55
56 console.log('Example 1: psychomachia_excerpt.png');
57
58 } catch (err) {
59 console.log(err);
60 }
61
62 //--------------------------------------------------------------------------------
63 // Example 2) Process document using multiple languages
64 try {
65 // A) Setup empty destination doc
66 const doc = await PDFNet.PDFDoc.create();
67 await doc.initSecurityHandler();
68
69 // B) Setup options with multiple target languages, English will always be considered as secondary language
70 const opts = new PDFNet.OCRModule.OCROptions();
71 if(useIRIS) opts.setOCREngine('iris');
72 opts.addLang('deu');
73 opts.addLang('fra');
74 opts.addLang('eng');
75
76 // C) Run OCR on the .jpg with options
77 await PDFNet.OCRModule.imageToPDF(doc, input_path + 'multi_lang.jpg', opts);
78
79 // D) check the result
80 await doc.save(output_path + 'multi_lang.pdf', 0);
81
82 console.log('Example 2: multi_lang.jpg');
83 } catch (err) {
84 console.log(err);
85 }
86
87 //--------------------------------------------------------------------------------
88 // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
89 try {
90 // A) Open the .pdf document
91 const doc = await PDFNet.PDFDoc.createFromFilePath(input_path + 'german_kids_song.pdf');
92 doc.initSecurityHandler();
93
94 // B) Setup options with a single language and an ignore zone
95 const opts = new PDFNet.OCRModule.OCROptions();
96 if(useIRIS) opts.setOCREngine('iris');
97 opts.addLang('deu');
98
99 const ignore_zones = [];
100 ignore_zones.push(new PDFNet.Rect(424, 163, 493, 730));
101 opts.addIgnoreZonesForPage(ignore_zones, 1);
102
103 // C) Run OCR on the .pdf with options
104 await PDFNet.OCRModule.processPDF(doc, opts);
105
106 // D) check the result
107 await doc.save(output_path + 'german_kids_song.pdf', 0);
108
109 console.log('Example 3: german_kids_song.pdf');
110 } catch (err) {
111 console.log(err);
112 }
113
114 //--------------------------------------------------------------------------------
115 // Example 4) Process multipage tiff with text/ignore zones specified for each page, optionally provide English as the target language
116 try {
117 // A) Setup empty destination doc
118 const doc = await PDFNet.PDFDoc.create();
119 await doc.initSecurityHandler();
120
121 // B) Setup options with a single language plus text/ignore zones
122 const opts = new PDFNet.OCRModule.OCROptions();
123 if(useIRIS) opts.setOCREngine('iris');
124 opts.addLang('eng');
125
126 var ignore_zones = [];
127 // ignore signature box in the first 2 pages
128 ignore_zones.push(new PDFNet.Rect(1492, 56, 2236, 432));
129 opts.addIgnoreZonesForPage(ignore_zones, 1);
130
131 ignore_zones = [];
132 ignore_zones.push(new PDFNet.Rect(1492, 56, 2236, 432));
133 opts.addIgnoreZonesForPage(ignore_zones, 2);
134
135 // can use a combination of ignore and text boxes to focus on the page area of interest,
136 // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
137 ignore_zones = [];
138 ignore_zones.push(new PDFNet.Rect(992, 1276, 1368, 1372));
139 opts.addIgnoreZonesForPage(ignore_zones, 3);
140
141
142 const text_zones = [];
143 // we only have text zones selected in page 3
144
145 // select horizontal BUFFER ZONE sign
146 text_zones.push(new PDFNet.Rect(900, 2384, 1236, 2480));
147 // select right vertical BUFFER ZONE sign
148 text_zones.push(new PDFNet.Rect(1960, 1976, 2016, 2296));
149 // select Lot No.
150 text_zones.push(new PDFNet.Rect(696, 1028, 1196, 1128));
151
152 // select part of the plan inside the BUFFER ZONE
153 text_zones.push(new PDFNet.Rect(428, 1484, 1784, 2344));
154 text_zones.push(new PDFNet.Rect(948, 1288, 1672, 1476));
155 opts.addTextZonesForPage(text_zones, 3);
156
157 // C) Run OCR on the .tif with options
158 await PDFNet.OCRModule.imageToPDF(doc, input_path + 'bc_environment_protection.tif', opts);
159
160 // D) check the result
161 await doc.save(output_path + 'bc_environment_protection.pdf', 0);
162
163 console.log('Example 4: bc_environment_protection.tif');
164 } catch (err) {
165 console.log(err);
166 }
167
168 //--------------------------------------------------------------------------------
169 // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words not in the dictionary or filtering special
170 // out special characters), and finally applying modified OCR JSON to the source PDF document
171 try {
172 // A) Open the .pdf document
173 const doc = await PDFNet.PDFDoc.createFromFilePath(input_path + 'zero_value_test_no_text.pdf');
174 await doc.initSecurityHandler();
175
176 const opts = new PDFNet.OCRModule.OCROptions();
177 if(useIRIS) opts.setOCREngine('iris');
178
179 // B) Run OCR on the .pdf with default English language
180 const json = await PDFNet.OCRModule.getOCRJsonFromPDF(doc, opts);
181
182 // C) Post-processing step (whatever it might be)
183 console.log('Have OCR result JSON, re-applying to PDF ');
184
185 // D) Apply potentially modified OCR JSON to the PDF
186 await PDFNet.OCRModule.applyOCRJsonToPDF(doc, json);
187
188 // E) Check the result
189 await doc.save(output_path + 'zero_value_test_no_text.pdf', 0);
190
191 console.log('Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf');
192 } catch (err) {
193 console.log(err);
194 }
195
196 //--------------------------------------------------------------------------------
197 // Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor
198 try {
199
200 // A) Setup empty destination doc
201 const doc = await PDFNet.PDFDoc.create();
202 await doc.initSecurityHandler();
203
204 const opts = new PDFNet.OCRModule.OCROptions();
205 if(useIRIS) opts.setOCREngine('iris');
206
207 // B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
208 // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
209 const xml = await PDFNet.OCRModule.getOCRXmlFromImage(doc, input_path + 'physics.tif', opts);
210
211 // C) Post-processing step (whatever it might be)
212 console.log('Have OCR result XML, re-applying to PDF');
213
214 // D) Apply potentially modified OCR XML to the PDF
215 await PDFNet.OCRModule.applyOCRXmlToPDF(doc, xml);
216
217 // E) Check the result
218 await doc.save(output_path + 'physics.pdf', 0);
219
220 console.log('Example 6: extracting and applying OCR XML from physics.tif');
221 } catch (err) {
222 console.log(err);
223 }
224 console.log('Done.');
225 } catch (err) {
226 console.log(err);
227 }
228 };
229 PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function(error) {
230 console.log('Error: ' + JSON.stringify(error));
231 }).then(function(){ return PDFNet.shutdown(); });
232 };
233 exports.runOCRTest();
234})(exports);
235// eslint-disable-next-line spaced-comment
236//# sourceURL=OCRTest.js
1<?php
2//---------------------------------------------------------------------------------------
3// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
4// Consult LICENSE.txt regarding license information.
5//---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
10// Relative path to the folder containing the test files.
11$input_path = getcwd()."/../../TestFiles/OCR/";
12$output_path = getcwd()."/../../TestFiles/Output/";
13
14//---------------------------------------------------------------------------------------
15// The following sample illustrates how to use OCR module
16//---------------------------------------------------------------------------------------
17
// The first step in every application using PDFNet is to initialize the
// library and set the path to common PDF resources. The library is usually
// initialized only once, but calling Initialize() multiple times is also fine.
PDFNet::Initialize($LicenseKey);
PDFNet::GetSystemFontList(); // Wait for fonts to be loaded if they haven't already. This is done because PHP can run into errors when shutting down if font loading is still in progress.

// The location of the OCR Module
PDFNet::AddResourceSearchPath("../../../Lib/");
if (!OCRModule::IsModuleAvailable()) {
	// Built by concatenation so each line ends in exactly one newline; a single
	// multi-line literal would also emit the source file's own line breaks and indentation.
	echo "Unable to run OCRTest: PDFTron SDK OCR module not available.\n"
		."---------------------------------------------------------------\n"
		."The OCR module is an optional add-on, available for download\n"
		."at https://dev.apryse.com/. If you have already downloaded this\n"
		."module, ensure that the SDK is able to find the required files\n"
		."using the PDFNet::AddResourceSearchPath() function.\n";
} else {
	//--------------------------------------------------------------------------------
	// Example 1) Process image without specifying options, default language - English - is used

	// A) Setup empty destination doc
	$doc = new PDFDoc();

	// B) Run OCR on the .png with default options (NULL)
	OCRModule::ImageToPDF($doc, $input_path."psychomachia_excerpt.png", NULL);

	// C) check the result
	$doc->Save($output_path."psychomachia_excerpt.pdf", 0);

	echo "Example 1: psychomachia_excerpt.png \n";

	//--------------------------------------------------------------------------------
	// Example 2) Process document using multiple languages

	// A) Setup empty destination doc
	$doc = new PDFDoc();

	// B) Setup options with multiple target languages, English will always be considered as secondary language
	$opts = new OCROptions();
	$opts->AddLang("deu");
	$opts->AddLang("fra");
	$opts->AddLang("eng");

	// C) Run OCR on the .jpg with options
	OCRModule::ImageToPDF($doc, $input_path."multi_lang.jpg", $opts);

	// D) check the result
	$doc->Save($output_path."multi_lang.pdf", 0);

	echo "Example 2: multi_lang.jpg \n";

	//--------------------------------------------------------------------------------
	// Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image

	// A) Open the .pdf document
	$doc = new PDFDoc($input_path."german_kids_song.pdf");

	// B) Setup options with a single language and an ignore zone
	$opts = new OCROptions();
	$opts->AddLang("deu");

	$ignore_zones = new RectCollection();
	$rect = new Rect(424.0, 163.0, 493.0, 730.0);
	$ignore_zones->AddRect($rect);
	$opts->AddIgnoreZonesForPage($ignore_zones, 1);

	// C) Run OCR on the .pdf with options
	OCRModule::ProcessPDF($doc, $opts);

	// D) check the result
	$doc->Save($output_path."german_kids_song.pdf", 0);

	echo "Example 3: german_kids_song.pdf \n";

	//--------------------------------------------------------------------------------
	// Example 4) Process multipage tiff with text/ignore zones specified for each page, optionally provide English as the target language

	// A) Setup empty destination doc
	$doc = new PDFDoc();

	// B) Setup options with a single language plus text/ignore zones
	$opts = new OCROptions();
	$opts->AddLang("eng");

	$ignore_zones = new RectCollection();
	// ignore signature box in the first 2 pages
	$ignore_zones->AddRect(new Rect(1492.0, 56.0, 2236.0, 432.0));
	$opts->AddIgnoreZonesForPage($ignore_zones, 1);
	$opts->AddIgnoreZonesForPage($ignore_zones, 2);

	// can use a combination of ignore and text boxes to focus on the page area of interest,
	// as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
	$ignore_zones->Clear();
	$ignore_zones->AddRect(new Rect(992.0, 1276.0, 1368.0, 1372.0));
	$opts->AddIgnoreZonesForPage($ignore_zones, 3);

	$text_zones = new RectCollection();
	// we only have text zones selected in page 3

	// select horizontal BUFFER ZONE sign
	$text_zones->AddRect(new Rect(900.0, 2384.0, 1236.0, 2480.0));
	// select right vertical BUFFER ZONE sign
	$text_zones->AddRect(new Rect(1960.0, 1976.0, 2016.0, 2296.0));
	// select Lot No.
	$text_zones->AddRect(new Rect(696.0, 1028.0, 1196.0, 1128.0));

	// select part of the plan inside the BUFFER ZONE
	$text_zones->AddRect(new Rect(428.0, 1484.0, 1784.0, 2344.0));
	$text_zones->AddRect(new Rect(948.0, 1288.0, 1672.0, 1476.0));
	$opts->AddTextZonesForPage($text_zones, 3);

	// C) Run OCR on the .tif with options
	OCRModule::ImageToPDF($doc, $input_path."bc_environment_protection.tif", $opts);

	// D) check the result
	$doc->Save($output_path."bc_environment_protection.pdf", 0);

	echo "Example 4: bc_environment_protection.tif \n";

	//--------------------------------------------------------------------------------
	// Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words not in the
	// dictionary or filtering out special characters), and finally applying modified OCR JSON to the source PDF document

	// A) Open the .pdf document
	$doc = new PDFDoc($input_path."zero_value_test_no_text.pdf");

	// B) Run OCR on the .pdf with default English language
	$json = OCRModule::GetOCRJsonFromPDF($doc, NULL);

	// C) Post-processing step (whatever it might be)
	echo "Have OCR result JSON, re-applying to PDF \n";

	// D) Apply potentially modified OCR JSON to the PDF
	OCRModule::ApplyOCRJsonToPDF($doc, $json);

	// E) check the result
	$doc->Save($output_path."zero_value_test_no_text.pdf", 0);

	echo "Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf \n";

	//--------------------------------------------------------------------------------
	// Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor

	// A) Setup empty destination doc
	$doc = new PDFDoc();

	// B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
	// in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
	$xml = OCRModule::GetOCRXmlFromImage($doc, $input_path."physics.tif", NULL);

	// C) Post-processing step (whatever it might be)
	echo "Have OCR result XML, re-applying to PDF \n";

	// D) Apply potentially modified OCR XML to the PDF
	OCRModule::ApplyOCRXmlToPDF($doc, $xml);

	// E) check the result
	$doc->Save($output_path."physics.pdf", 0);

	echo "Example 6: extracting and applying OCR XML from physics.tif \n";

	//--------------------------------------------------------------------------------
	// Example 7) Resolution can be manually set, when DPI missing from metadata or is wrong

	// A) Setup empty destination doc
	$doc = new PDFDoc();

	// B) Setup options with a text zone
	$opts = new OCROptions();
	$text_zones = new RectCollection();
	$text_zones->AddRect(new Rect(140.0, 870.0, 310.0, 920.0));
	$opts->AddTextZonesForPage($text_zones, 1);

	// C) Manually override DPI
	$opts->AddDPI(100);

	// D) Run OCR on the .jpg with options
	OCRModule::ImageToPDF($doc, $input_path."corrupted_dpi.jpg", $opts);

	// E) check the result
	$doc->Save($output_path."corrupted_dpi.pdf", 0);

	echo "Example 7: converting image with corrupted resolution metadata corrupted_dpi.jpg to pdf with searchable text \n";

	// Printed only after the last example has finished.
	echo "Done. \n";
}
PDFNet::Terminate();
237
238?>
1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11sys.path.append("../../LicenseKey/PYTHON")
12from LicenseKey import *
13
14# Relative path to the folder containing test files.
15input_path = "../../TestFiles/OCR/"
16output_path = "../../TestFiles/Output/"
17
18# ---------------------------------------------------------------------------------------
19# The following sample illustrates how to use OCR module
20# --------------------------------------------------------------------------------------
21
def main():
    """Run the OCR module examples (1-7) against the files in ``input_path``.

    Each example builds or opens a PDFDoc, runs the OCR module on it and saves
    the result under ``output_path``. If the OCR module cannot be found, a
    notice is printed and no example is run. PDFNet is terminated on every
    exit path, including when an example raises.
    """

    # The first step in every application using PDFNet is to initialize the
    # library and set the path to common PDF resources. The library is usually
    # initialized only once, but calling Initialize() multiple times is also fine.
    PDFNet.Initialize(LicenseKey)

    try:
        # The location of the OCR Module
        PDFNet.AddResourceSearchPath("../../../PDFNetC/Lib/")

        if not OCRModule.IsModuleAvailable():
            # Built from adjacent literals so the text does not depend on the
            # indentation of this source file.
            print(
                "\n"
                "Unable to run OCRTest: PDFTron SDK OCR module not available.\n"
                "---------------------------------------------------------------\n"
                "The OCR module is an optional add-on, available for download\n"
                "at https://dev.apryse.com/. If you have already downloaded this\n"
                "module, ensure that the SDK is able to find the required files\n"
                "using the PDFNet::AddResourceSearchPath() function."
            )
            return

        # Example 1) Process image without specifying options, default language - English - is used
        # --------------------------------------------------------------------------------

        # A) Setup empty destination doc
        doc = PDFDoc()

        # B) Run OCR on the .png with default options (None)
        OCRModule.ImageToPDF(doc, input_path + "psychomachia_excerpt.png", None)

        # C) Check the result
        doc.Save(output_path + "psychomachia_excerpt.pdf", 0)
        print("Example 1: psychomachia_excerpt.png")

        # Example 2) Process document using multiple languages
        # --------------------------------------------------------------------------------

        # A) Setup empty destination doc
        doc = PDFDoc()

        # B) Setup options with multiple target languages, English will always be considered as secondary language
        opts = OCROptions()
        opts.AddLang("deu")
        opts.AddLang("fra")
        opts.AddLang("eng")

        # C) Run OCR on the .jpg with options
        OCRModule.ImageToPDF(doc, input_path + "multi_lang.jpg", opts)

        # D) Check the result
        doc.Save(output_path + "multi_lang.pdf", 0)
        print("Example 2: multi_lang.jpg")

        # Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
        # --------------------------------------------------------------------------------

        # A) Open the .pdf document
        doc = PDFDoc(input_path + "german_kids_song.pdf")

        # B) Setup options with a single language and an ignore zone
        opts = OCROptions()
        opts.AddLang("deu")

        ignore_zones = RectCollection()
        ignore_zones.AddRect(Rect(424, 163, 493, 730))
        opts.AddIgnoreZonesForPage(ignore_zones, 1)

        # C) Run OCR on the .pdf with options
        OCRModule.ProcessPDF(doc, opts)

        # D) Check the result
        doc.Save(output_path + "german_kids_song.pdf", 0)
        print("Example 3: german_kids_song.pdf")

        # Example 4) Process multi-page tiff with text/ignore zones specified for each page,
        # optionally provide English as the target language
        # --------------------------------------------------------------------------------

        # A) Setup empty destination doc
        doc = PDFDoc()

        # B) Setup options with a single language plus text/ignore zones
        opts = OCROptions()
        opts.AddLang("eng")

        ignore_zones = RectCollection()

        # ignore signature box in the first 2 pages
        ignore_zones.AddRect(Rect(1492, 56, 2236, 432))
        opts.AddIgnoreZonesForPage(ignore_zones, 1)
        opts.AddIgnoreZonesForPage(ignore_zones, 2)

        # can use a combination of ignore and text boxes to focus on the page area of interest,
        # as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
        ignore_zones.Clear()
        ignore_zones.AddRect(Rect(992, 1276, 1368, 1372))
        opts.AddIgnoreZonesForPage(ignore_zones, 3)

        text_zones = RectCollection()
        # we only have text zones selected in page 3

        # select horizontal BUFFER ZONE sign
        text_zones.AddRect(Rect(900, 2384, 1236, 2480))

        # select right vertical BUFFER ZONE sign
        text_zones.AddRect(Rect(1960, 1976, 2016, 2296))
        # select Lot No.
        text_zones.AddRect(Rect(696, 1028, 1196, 1128))

        # select part of the plan inside the BUFFER ZONE
        text_zones.AddRect(Rect(428, 1484, 1784, 2344))
        text_zones.AddRect(Rect(948, 1288, 1672, 1476))
        opts.AddTextZonesForPage(text_zones, 3)

        # C) Run OCR on the .tif with options
        OCRModule.ImageToPDF(doc, input_path + "bc_environment_protection.tif", opts)

        # D) Check the result
        doc.Save(output_path + "bc_environment_protection.pdf", 0)
        print("Example 4: bc_environment_protection.tif")

        # Example 5) Alternative workflow for extracting OCR result JSON, postprocessing
        # (e.g., removing words not in the dictionary or filtering out special characters),
        # and finally applying modified OCR JSON to the source PDF document
        # --------------------------------------------------------------------------------

        # A) Open the .pdf document
        doc = PDFDoc(input_path + "zero_value_test_no_text.pdf")

        # B) Run OCR on the .pdf with default English language
        # (named ocr_json so the local does not shadow the stdlib ``json`` module name)
        ocr_json = OCRModule.GetOCRJsonFromPDF(doc, None)

        # C) Post-processing step (whatever it might be)
        print("Have OCR result JSON, re-applying to PDF")

        # D) Apply potentially modified OCR JSON to the PDF
        OCRModule.ApplyOCRJsonToPDF(doc, ocr_json)

        # E) Check the result
        doc.Save(output_path + "zero_value_test_no_text.pdf", 0)
        print("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf")

        # Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format,
        # similar to the one used by TextExtractor
        # --------------------------------------------------------------------------------

        # A) Setup empty destination doc
        doc = PDFDoc()

        # B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
        # in the process we convert the source image into PDF.
        # We reuse this PDF document later to add hidden text layer to it.
        xml = OCRModule.GetOCRXmlFromImage(doc, input_path + "physics.tif", None)

        # C) Post-processing step (whatever it might be)
        print("Have OCR result XML, re-applying to PDF")

        # D) Apply potentially modified OCR XML to the PDF
        OCRModule.ApplyOCRXmlToPDF(doc, xml)

        # E) Check the result
        doc.Save(output_path + "physics.pdf", 0)
        print("Example 6: extracting and applying OCR XML from physics.tif")

        # Example 7) Resolution can be manually set, when DPI missing from metadata or is wrong
        # --------------------------------------------------------------------------------

        # A) Setup empty destination doc
        doc = PDFDoc()

        # B) Setup options with a text zone
        opts = OCROptions()
        text_zones = RectCollection()
        text_zones.AddRect(Rect(140, 870, 310, 920))
        opts.AddTextZonesForPage(text_zones, 1)

        # C) Manually override DPI
        opts.AddDPI(100)

        # D) Run OCR on the .jpg with options
        OCRModule.ImageToPDF(doc, input_path + "corrupted_dpi.jpg", opts)

        # E) Check the result
        doc.Save(output_path + "corrupted_dpi.pdf", 0)
        print("Example 7: converting image with corrupted resolution metadata corrupted_dpi.jpg to pdf with searchable text")
    finally:
        # Shut the library down on every path: module missing, success, or an
        # exception raised by one of the examples.
        PDFNet.Terminate()
235if __name__ == '__main__':
236 main()
1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6require '../../../PDFNetC/Lib/PDFNetRuby'
7include PDFNetRuby
8require '../../LicenseKey/RUBY/LicenseKey'
9
$stdout.sync = true

# Relative path to the folder containing test files.
input_path = "../../TestFiles/OCR/"
output_path = "../../TestFiles/Output/"

#---------------------------------------------------------------------------------------
# The following sample illustrates how to use the OCR module. It runs seven
# examples in sequence; each one reads its input from input_path and saves the
# resulting PDF under output_path.
#---------------------------------------------------------------------------------------

# The first step in every application using PDFNet is to initialize the
# library and set the path to common PDF resources. The library is usually
# initialized only once, but calling Initialize multiple times is also fine.
PDFNet.Initialize(PDFTronLicense.Key)

# The location of the OCR Module
PDFNet.AddResourceSearchPath("../../../PDFNetC/Lib/")

begin
  if !OCRModule.IsModuleAvailable

    # Without the optional OCR add-on none of the examples can run; explain how to get it.
    puts 'Unable to run OCRTest: PDFTron SDK OCR module not available.'
    puts '---------------------------------------------------------------'
    puts 'The OCR module is an optional add-on, available for download'
    puts 'at https://dev.apryse.com/. If you have already downloaded this'
    puts 'module, ensure that the SDK is able to find the required files'
    puts 'using the PDFNet::AddResourceSearchPath() function.'

  else

    # Example 1) Process image without specifying options, default language - English - is used
    # --------------------------------------------------------------------------------

    # A) Setup empty destination doc
    doc = PDFDoc.new

    # B) Run OCR on the .png without options (nil)
    OCRModule.ImageToPDF(doc, input_path + "psychomachia_excerpt.png", nil)

    # C) Check the result
    doc.Save(output_path + "psychomachia_excerpt.pdf", 0)
    puts "Example 1: psychomachia_excerpt.png"

    doc.Close

    # Example 2) Process document using multiple languages
    # --------------------------------------------------------------------------------

    # A) Setup empty destination doc
    doc = PDFDoc.new

    # B) Setup options with multiple target languages, English will always be considered as secondary language
    opts = OCROptions.new
    opts.AddLang("deu")
    opts.AddLang("fra")
    opts.AddLang("eng")

    # C) Run OCR on the .jpg with options
    OCRModule.ImageToPDF(doc, input_path + "multi_lang.jpg", opts)

    # D) Check the result
    doc.Save(output_path + "multi_lang.pdf", 0)
    puts "Example 2: multi_lang.jpg"

    doc.Close

    # Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
    # --------------------------------------------------------------------------------

    # A) Open the .pdf document
    doc = PDFDoc.new(input_path + "german_kids_song.pdf")

    # B) Setup options with a single language and an ignore zone
    opts = OCROptions.new
    opts.AddLang("deu")

    ignore_zones = RectCollection.new
    ignore_zones.AddRect(Rect.new(424, 163, 493, 730))
    opts.AddIgnoreZonesForPage(ignore_zones, 1)

    # C) Run OCR on the .pdf with options.
    # The options must be passed here; with nil the German language setting
    # and the ignore zone configured above would be silently discarded.
    OCRModule.ProcessPDF(doc, opts)

    # D) Check the result
    doc.Save(output_path + "german_kids_song.pdf", 0)
    puts "Example 3: german_kids_song.pdf"

    doc.Close

    # Example 4) Process multi-page tiff with text/ignore zones specified for each page,
    # optionally provide English as the target language
    # --------------------------------------------------------------------------------

    # A) Setup empty destination doc
    doc = PDFDoc.new

    # B) Setup options with a single language plus text/ignore zones
    opts = OCROptions.new
    opts.AddLang("eng")

    ignore_zones = RectCollection.new

    # ignore signature box in the first 2 pages (the same collection is registered for both)
    ignore_zones.AddRect(Rect.new(1492, 56, 2236, 432))
    opts.AddIgnoreZonesForPage(ignore_zones, 1)
    opts.AddIgnoreZonesForPage(ignore_zones, 2)

    # can use a combination of ignore and text boxes to focus on the page area of interest,
    # as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
    ignore_zones.Clear
    ignore_zones.AddRect(Rect.new(992, 1276, 1368, 1372))
    opts.AddIgnoreZonesForPage(ignore_zones, 3)

    # we only have text zones selected in page 3
    text_zones = RectCollection.new

    # select horizontal BUFFER ZONE sign
    text_zones.AddRect(Rect.new(900, 2384, 1236, 2480))

    # select right vertical BUFFER ZONE sign
    text_zones.AddRect(Rect.new(1960, 1976, 2016, 2296))

    # select Lot No.
    text_zones.AddRect(Rect.new(696, 1028, 1196, 1128))

    # select part of the plan inside the BUFFER ZONE
    text_zones.AddRect(Rect.new(428, 1484, 1784, 2344))
    text_zones.AddRect(Rect.new(948, 1288, 1672, 1476))
    opts.AddTextZonesForPage(text_zones, 3)

    # C) Run OCR on the .tif with options
    OCRModule.ImageToPDF(doc, input_path + "bc_environment_protection.tif", opts)

    # D) Check the result
    doc.Save(output_path + "bc_environment_protection.pdf", 0)
    puts "Example 4: bc_environment_protection.tif"

    doc.Close

    # Example 5) Alternative workflow for extracting OCR result JSON, postprocessing
    # (e.g., removing words not in the dictionary or filtering out special
    # characters), and finally applying modified OCR JSON to the source PDF document
    # --------------------------------------------------------------------------------

    # A) Open the .pdf document
    doc = PDFDoc.new(input_path + "zero_value_test_no_text.pdf")

    # B) Run OCR on the .pdf with default English language
    json = OCRModule.GetOCRJsonFromPDF(doc, nil)

    # C) Post-processing step (whatever it might be); this sample leaves the JSON unchanged
    puts "Have OCR result JSON, re-applying to PDF"

    OCRModule.ApplyOCRJsonToPDF(doc, json)

    # D) Check the result
    doc.Save(output_path + "zero_value_test_no_text.pdf", 0)
    puts "Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf"

    doc.Close

    # Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format,
    # similar to the one used by TextExtractor
    # --------------------------------------------------------------------------------

    # A) Setup empty destination doc
    doc = PDFDoc.new

    # B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
    # in the process we convert the source image into PDF.
    # We reuse this PDF document later to add hidden text layer to it.
    xml = OCRModule.GetOCRXmlFromImage(doc, input_path + "physics.tif", nil)

    # C) Post-processing step (whatever it might be); this sample leaves the XML unchanged
    puts "Have OCR result XML, re-applying to PDF"

    OCRModule.ApplyOCRXmlToPDF(doc, xml)

    # D) Check the result
    doc.Save(output_path + "physics.pdf", 0)
    puts "Example 6: extracting and applying OCR XML from physics.tif"

    doc.Close

    # Example 7) Resolution can be manually set, when DPI missing from metadata or is wrong
    # --------------------------------------------------------------------------------

    # A) Setup empty destination doc
    doc = PDFDoc.new

    # B) Setup options with a text zone.
    # The rectangle is registered as a TEXT zone (the area to recognize); registering
    # it as an ignore zone would exclude that very region from OCR.
    opts = OCROptions.new
    text_zones = RectCollection.new
    text_zones.AddRect(Rect.new(140, 870, 310, 920))
    opts.AddTextZonesForPage(text_zones, 1)

    # C) Manually override DPI
    opts.AddDPI(100)

    # D) Run OCR on the .jpg with options
    OCRModule.ImageToPDF(doc, input_path + "corrupted_dpi.jpg", opts)

    # E) Check the result
    doc.Save(output_path + "corrupted_dpi.pdf", 0)
    puts "Example 7: converting image with corrupted resolution metadata corrupted_dpi.jpg to pdf with searchable text"

    doc.Close

  end
rescue Exception => e
  # Report any failure from the examples; the remaining examples are not run.
  puts e
end

# Release the library once all examples have finished (or one of them failed).
PDFNet.Terminate
1'---------------------------------------------------------------------------------------
2' Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3' Consult legal.txt regarding legal and license information.
4'---------------------------------------------------------------------------------------
5Imports System
6
7Imports pdftron
8Imports pdftron.Common
9Imports pdftron.SDF
10Imports pdftron.PDF
11
' <summary>
'---------------------------------------------------------------------------------------
' The following sample illustrates how to use OCR module
'---------------------------------------------------------------------------------------
' </summary>
Module OCRTestVB
    Dim pdfNetLoader As PDFNetLoader
    Sub New()
        pdfNetLoader = pdftron.PDFNetLoader.Instance()
    End Sub

    ' The main entry point for the application.
    ' Runs six OCR examples in sequence; each example has its own Try block so a
    ' PDFNetException in one of them is printed and the following examples still run.
    Sub Main()

        ' The first step in every application using PDFNet is to initialize the
        ' library and set the path to common PDF resources. The library is usually
        ' initialized only once, but calling Initialize() multiple times is also fine.
        PDFNet.Initialize(PDFTronLicense.Key)

        ' Can optionally set path to the OCR module
        PDFNet.AddResourceSearchPath("../../../../../Lib/")

        ' If the IRIS OCR module is available, every example selects the "iris" engine
        Dim use_iris As Boolean = OCRModule.IsIRISModuleAvailable()
        If Not OCRModule.IsModuleAvailable() Then
            Console.WriteLine("")
            Console.WriteLine("Unable to run OCRTest: Apryse SDK OCR module not available.")
            Console.WriteLine("---------------------------------------------------------------")
            Console.WriteLine("The OCR module is an optional add-on, available for download")
            Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this")
            Console.WriteLine("module, ensure that the SDK is able to find the required files")
            Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.")
            Console.WriteLine("")
            ' The library was initialized above, so release it before leaving early
            PDFNet.Terminate()
            Return
        End If

        ' Relative path to the folder containing test files.
        Dim input_path As String = "../../../../TestFiles/OCR/"
        Dim output_path As String = "../../../../TestFiles/Output/"

        '--------------------------------------------------------------------------------
        ' Example 1) Process image
        Try
            ' A) Setup empty destination doc.
            Using doc As PDFDoc = New PDFDoc()

                ' B) Set English as the language of choice
                Dim opts As OCROptions = New OCROptions()
                If use_iris Then opts.SetOCREngine("iris")
                opts.AddLang("eng")

                ' C) Run OCR on the .png with options
                OCRModule.ImageToPDF(doc, input_path + "psychomachia_excerpt.png", opts)

                ' D) Check the result
                doc.Save(output_path + "psychomachia_excerpt.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 1: psychomachia_excerpt.png")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

        '--------------------------------------------------------------------------------
        ' Example 2) Process document using multiple languages
        Try
            ' A) Setup empty destination doc.
            Using doc As PDFDoc = New PDFDoc()

                ' B) Setup options with multiple target languages, English will always be considered as secondary language
                Dim opts As OCROptions = New OCROptions()
                If use_iris Then opts.SetOCREngine("iris")
                opts.AddLang("deu")
                opts.AddLang("fra")
                opts.AddLang("eng")

                ' C) Run OCR on the .jpg with options
                OCRModule.ImageToPDF(doc, input_path + "multi_lang.jpg", opts)

                ' D) Check the result
                doc.Save(output_path + "multi_lang.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 2: multi_lang.jpg")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

        '--------------------------------------------------------------------------------
        ' Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
        Try
            ' A) Open the .pdf document.
            Using doc As PDFDoc = New PDFDoc(input_path + "german_kids_song.pdf")

                ' B) Setup options with a single language and an ignore zone
                Dim opts As OCROptions = New OCROptions()
                If use_iris Then opts.SetOCREngine("iris")
                opts.AddLang("deu")

                Dim zones As RectCollection = New RectCollection()
                zones.AddRect(424, 163, 493, 730)
                opts.AddIgnoreZonesForPage(zones, 1)

                ' C) Run OCR on the .pdf with options
                OCRModule.ProcessPDF(doc, opts)

                ' D) Check the result
                doc.Save(output_path + "german_kids_song.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 3: german_kids_song.pdf")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

        '--------------------------------------------------------------------------------
        ' Example 4) Process multipage tiff with text/ignore zones specified for each page
        Try
            ' A) Setup empty destination doc.
            Using doc As PDFDoc = New PDFDoc()

                ' B) Setup options with a single language plus text/ignore zones
                Dim opts As OCROptions = New OCROptions()
                If use_iris Then opts.SetOCREngine("iris")
                opts.AddLang("eng")

                ' The same collection is reused for every page: it is filled,
                ' registered with the options, then cleared for the next page.
                Dim zones As RectCollection = New RectCollection()

                ' ignore signature box in the first 2 pages
                zones.AddRect(1492, 56, 2236, 432)
                opts.AddIgnoreZonesForPage(zones, 1)
                zones.Clear()

                zones.AddRect(1492, 56, 2236, 432)
                opts.AddIgnoreZonesForPage(zones, 2)
                zones.Clear()

                ' can use a combination of ignore and text boxes to focus on the page area of interest,
                ' as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
                zones.AddRect(992, 1276, 1368, 1372)
                opts.AddIgnoreZonesForPage(zones, 3)
                zones.Clear()

                ' we only have text zones selected in page 3

                ' select horizontal BUFFER ZONE sign
                zones.AddRect(900, 2384, 1236, 2480)
                ' select right vertical BUFFER ZONE sign
                zones.AddRect(1960, 1976, 2016, 2296)
                ' select Lot No.
                zones.AddRect(696, 1028, 1196, 1128)

                ' select part of the plan inside the BUFFER ZONE
                zones.AddRect(428, 1484, 1784, 2344)
                zones.AddRect(948, 1288, 1672, 1476)
                ' These rectangles are the areas to recognize, so they are registered as
                ' text zones; registering them as ignore zones would exclude them from OCR.
                opts.AddTextZonesForPage(zones, 3)

                ' C) Run OCR on the .tif with options
                OCRModule.ImageToPDF(doc, input_path + "bc_environment_protection.tif", opts)

                ' D) Check the result
                doc.Save(output_path + "bc_environment_protection.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 4: bc_environment_protection.tif")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

        '--------------------------------------------------------------------------------
        ' Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words
        ' not in the dictionary or filtering out special characters), and finally applying modified OCR JSON
        ' to the source PDF document
        Try
            ' A) Open the .pdf document.
            Using doc As PDFDoc = New PDFDoc(input_path + "zero_value_test_no_text.pdf")

                ' B) Set English as the language of choice
                Dim opts As OCROptions = New OCROptions()
                If use_iris Then opts.SetOCREngine("iris")
                opts.AddLang("eng")

                ' C) Run OCR on the .pdf
                Dim json As String = OCRModule.GetOCRJsonFromPDF(doc, opts)

                ' D) Post-processing step (whatever it might be); this sample only prints a status message
                Console.WriteLine("Have OCR result JSON, re-applying to PDF")

                ' E) Apply potentially modified OCR JSON to the PDF
                OCRModule.ApplyOCRJsonToPDF(doc, json)

                ' F) Check the result
                doc.Save(output_path + "zero_value_test_no_text.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

        '--------------------------------------------------------------------------------
        ' Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor
        Try
            ' A) Setup empty destination doc.
            Using doc As PDFDoc = New PDFDoc()

                ' B) Set English as the language of choice
                Dim opts As OCROptions = New OCROptions()
                If use_iris Then opts.SetOCREngine("iris")
                opts.AddLang("eng")

                ' C) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
                ' in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
                Dim xml As String = OCRModule.GetOCRXmlFromImage(doc, input_path + "physics.tif", opts)

                ' D) Post-processing step (whatever it might be); this sample only prints a status message
                Console.WriteLine("Have OCR result XML, re-applying to PDF")

                ' E) Apply potentially modified OCR XML to the PDF
                OCRModule.ApplyOCRXmlToPDF(doc, xml)

                ' F) Check the result
                doc.Save(output_path + "physics.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 6: extracting and applying OCR XML from physics.tif")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

        PDFNet.Terminate()
    End Sub

End Module
Did you find this helpful?
Trial setup questions? Ask experts on Discord.
Need other help? Contact Support.
Pricing or product questions? Contact Sales.