Python 文档转换方案
分类-Python
二月 24, 2011 22:02
1009 Views 0 Comments
Python 文档转换解决方案
- 文中所用平台为Linux
- 将 Office/PDF 转换为txt是为了进行全文索引
- 提取PDF首页缩略图进行预览
- 将 Office/PDF/Txt 转换为SWF是为了可以在线浏览
Office -> PDF
- 支持 doc/xls/ppt 等多种格式
- docx 等也支持
- 不能并行转换
可以使用OpenOffice 的API进行转换
1. 启动 OpenOffice 进程
$ soffice "-accept=socket,host=localhost,port=8100;urp;StarOffice.ServiceManager" -nologo -headless -nofirststartwizard
2. 安装pyuno
3. 代码(从网上找的代码,测试可用;另外还有一个封装好的 DocumentConverter 类,见 http://www.artofsolving.com/opensource/pyodconverter )
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/usr/bin/env python
import uno
from unohelper import Base, systemPathToFileUrl, absolutize
from com.sun.star.beans import PropertyValue
def convert_office_to_pdf(source_file, dst_file, host="localhost", port=8100):
    """Convert an office document to PDF through a running OpenOffice process.

    Args:
        source_file: Path of the document to convert.
        dst_file: Path of the PDF to write; an existing file is overwritten.
        host: Host on which soffice is listening.
        port: Port on which soffice is listening. Defaults to 8100, the port
            used by the ``soffice -accept=...`` command shown in this article.

    Raises:
        IOError: If OpenOffice could not load ``source_file``.
    """
    # Local import so the snippet stays self-contained.
    import os

    def make_property(name, value):
        """Build a single UNO PropertyValue."""
        prop = PropertyValue()
        prop.Name = name
        prop.Value = value
        return prop

    # UNO needs absolute file URLs; a relative path would not resolve on the
    # OpenOffice side, whose working directory may differ from ours.
    url = systemPathToFileUrl(os.path.abspath(source_file))
    url_save = systemPathToFileUrl(os.path.abspath(dst_file))

    ### Get Service Manager
    context = uno.getComponentContext()
    resolver = context.ServiceManager.createInstanceWithContext(
        "com.sun.star.bridge.UnoUrlResolver", context)
    ctx = resolver.resolve(
        "uno:socket,host=%s,port=%s;urp;StarOffice.ComponentContext"
        % (host, port))
    smgr = ctx.ServiceManager

    ### Load document
    desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
    doc = desktop.loadComponentFromURL(
        url, "_blank", 0, (make_property("Hidden", True),))
    if doc is None:
        # loadComponentFromURL returns None when the file cannot be opened.
        raise IOError("OpenOffice could not load %r" % (source_file,))

    try:
        ### Pick the PDF export filter matching the document family.
        # "writer_pdf_Export" only works for text documents; spreadsheets,
        # presentations and drawings each have their own export filter.
        # Presentation is tested before Drawing on purpose.
        filter_name = "writer_pdf_Export"
        export_filters = (
            ("com.sun.star.sheet.SpreadsheetDocument", "calc_pdf_Export"),
            ("com.sun.star.presentation.PresentationDocument",
             "impress_pdf_Export"),
            ("com.sun.star.drawing.DrawingDocument", "draw_pdf_Export"),
        )
        for service, candidate in export_filters:
            if doc.supportsService(service):
                filter_name = candidate
                break

        ### Save File
        doc.storeToURL(url_save, (
            make_property("Overwrite", True),
            make_property("FilterName", filter_name),
        ))
    finally:
        # Always release the document, even when the export fails, so that
        # hidden documents do not pile up inside the soffice process.
        doc.dispose()
TXT -> PDF
可以使用 Office -> PDF 的方法转换
PDF -> TXT
将PDF转换成TXT主要是为了进行全文索引
可以用第三方工具 pdftotext 进行转换
1. 安装 pdftotext(该命令由 poppler-utils 软件包提供): sudo apt-get install poppler-utils
2. 用subprocess 调用pdftotext 进行转换
1
2
3
4
5
6
# Pass the arguments as a list: formatting the paths into one string and
# splitting it on ' ' breaks any path that contains a space.
cmd = ["pdftotext", source_file, dst_file]
# NOTE: stdout/stderr are pipes, so call popen.communicate() to collect the
# output and wait for pdftotext to finish.
popen = subprocess.Popen(
    args=cmd,
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE)
PDF -> SWF
可以使用 swftools 中的 pdf2swf 进行转换
1. 安装 swftools 2. 用 subprocess 调用swftools命令
1
2
3
4
5
6
# Pass the arguments as a list: formatting the paths into one string and
# splitting it on ' ' breaks any path that contains a space.
cmd = ["pdf2swf", "-T9", source_file, "-o", dst_file]
# NOTE: stdout/stderr are pipes, so call popen.communicate() to collect the
# output and wait for pdf2swf to finish.
popen = subprocess.Popen(
    args=cmd,
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE)
Office --> SWF
可以 Office -> PDF -> SWF
TXT --> SWF
可以 TXT -> PDF -> SWF
提取PDF缩略图
- 安装 gfx(swftools 自带的 Python 模块),参考 http://www.swftools.org/gfx_tutorial.html
1
2
3
4
5
6
7
8
9
10
# Render the first page of a PDF to an image file with the swftools gfx module.
pdf_doc = gfx.open("pdf", tmp_pdf_file)  # was "ux.open": the module is gfx
img = gfx.ImageList()
# Turn on antialiasing; "antialise" is the spelling used in the gfx tutorial.
img.setparameter("antialise", "1")
page1 = pdf_doc.getPage(1)
# NOTE: this is the size of the generated image and must match the page's
# width and height. If it is smaller than the page, the image is cropped;
# it is not scaled down automatically.
img.startpage(page1.width, page1.height)
page1.render(img)
img.endpage()
img.save(tmp_img_file)
常见问题
subprocess 超时
在转换较大的文档或有问题的文档时,转换进程可能会卡死,导致转换队列被堵住。可以设置一个超时时间,超时后自动杀死转换进程
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import os
import subprocess
import tempfile
import time


class Process():
    """Run an external command, optionally killing it after a timeout."""

    # Seconds between checks on whether the child has exited.
    _POLL_INTERVAL = 0.05
    # Seconds the child gets to exit after SIGTERM before it is killed.
    _TERM_GRACE = 2

    @classmethod
    def run(cls, cmd, timeout=0):
        """Run ``cmd`` and return its exit status.

        Args:
            cmd: Either a command string, which is split on whitespace, or a
                list/tuple of arguments. Use the list form for paths that
                contain spaces.
            timeout: Maximum run time in seconds. 0 (the default) waits
                until the command exits on its own.

        Returns:
            The exit status of the command. On POSIX it is negative when the
            command was killed by a signal, e.g. after a timeout.

        Raises:
            TypeError: If ``timeout`` is not a number.
        """
        if not isinstance(timeout, (int, float)):
            raise TypeError(
                "timeout must be a number of seconds, got %r" % (timeout,))
        args = cmd.split() if hasattr(cmd, "split") else list(cmd)

        # Capture output in temporary files instead of pipes: a child that
        # writes more than the pipe buffer holds would block forever while
        # this process is waiting for it without reading.
        devnull = open(os.devnull, "rb")
        out_file = tempfile.TemporaryFile()
        err_file = tempfile.TemporaryFile()
        try:
            popen = subprocess.Popen(args=args,
                                     stdin=devnull,
                                     stdout=out_file,
                                     stderr=err_file)
            if timeout != 0:
                cls._wait_until(popen, timeout)
                if popen.poll() is None:
                    # Ask politely first so the tool can clean up; only
                    # force-kill if it ignores the request.
                    popen.terminate()
                    cls._wait_until(popen, cls._TERM_GRACE)
                    if popen.poll() is None:
                        popen.kill()
            ret = popen.wait()
            if ret != 0:
                print("[INFO] process stdout:\n%s" % cls._read_text(out_file))
                print("[INFO] process stderr:\n%s" % cls._read_text(err_file))
            return ret
        finally:
            devnull.close()
            out_file.close()
            err_file.close()

    @classmethod
    def _wait_until(cls, popen, seconds):
        """Poll ``popen`` until it exits or ``seconds`` have elapsed."""
        deadline = time.time() + seconds
        while popen.poll() is None and time.time() < deadline:
            time.sleep(cls._POLL_INTERVAL)

    @staticmethod
    def _read_text(stream):
        """Return everything written to ``stream`` as printable text."""
        stream.seek(0)
        data = stream.read()
        if not isinstance(data, str):
            # Python 3 returns bytes here; decode so it prints readably.
            data = data.decode("utf-8", "replace")
        return data
# Example: run pdf2swf with a timeout of 5*60 seconds (5 minutes).
Process.run("pdf2swf -T9 %s -o %s" % (source_file, dst_file), 5*60)
pdftotext 乱码
- 安装libpoppler7
pdf2swf 乱码
- 安装 xpdf-utils
- 安装 xpdf-chinese-simplified
- 根据 xpdf-chinese-simplified 的说明进行配置(或者Google之)
- 在 pdf2swf 转换时添加参数 -s languagedir="path_to_xpdf_chinese_simplified"
TXT 乱码
1. 转换为PDF时乱码 2. 最终生成的SWF乱码
解决方法: 检查所有 TXT 文件的编码,如果不是 UTF-8,就先转换成 UTF-8,然后再进行其它转换。可以参考我上一篇有关 TXT 编码的文章
$ soffice "-accept=socket,host=localhost,port=8100;urp;StarOffice.ServiceManager" -nologo -headless -nofirststartwizard|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
|
#!/usr/bin/env python
import uno
from unohelper import Base, systemPathToFileUrl, absolutize
from com.sun.star.beans import PropertyValue


def convert_office_to_pdf(source_file, dst_file, host="localhost", port=8100):
    """Convert an office document to PDF through a running OpenOffice process.

    Args:
        source_file: Path of the document to convert.
        dst_file: Path of the PDF to write; an existing file is overwritten.
        host: Host on which soffice is listening.
        port: Port on which soffice is listening. Defaults to 8100, the port
            used by the ``soffice -accept=...`` command shown in this article.

    Raises:
        IOError: If OpenOffice could not load ``source_file``.
    """
    # Local import so the snippet stays self-contained.
    import os

    def make_property(name, value):
        """Build a single UNO PropertyValue."""
        prop = PropertyValue()
        prop.Name = name
        prop.Value = value
        return prop

    # UNO needs absolute file URLs; a relative path would not resolve on the
    # OpenOffice side, whose working directory may differ from ours.
    url = systemPathToFileUrl(os.path.abspath(source_file))
    url_save = systemPathToFileUrl(os.path.abspath(dst_file))

    ### Get Service Manager
    context = uno.getComponentContext()
    resolver = context.ServiceManager.createInstanceWithContext(
        "com.sun.star.bridge.UnoUrlResolver", context)
    ctx = resolver.resolve(
        "uno:socket,host=%s,port=%s;urp;StarOffice.ComponentContext"
        % (host, port))
    smgr = ctx.ServiceManager

    ### Load document
    desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
    doc = desktop.loadComponentFromURL(
        url, "_blank", 0, (make_property("Hidden", True),))
    if doc is None:
        # loadComponentFromURL returns None when the file cannot be opened.
        raise IOError("OpenOffice could not load %r" % (source_file,))

    try:
        ### Pick the PDF export filter matching the document family.
        # "writer_pdf_Export" only works for text documents; spreadsheets,
        # presentations and drawings each have their own export filter.
        # Presentation is tested before Drawing on purpose.
        filter_name = "writer_pdf_Export"
        export_filters = (
            ("com.sun.star.sheet.SpreadsheetDocument", "calc_pdf_Export"),
            ("com.sun.star.presentation.PresentationDocument",
             "impress_pdf_Export"),
            ("com.sun.star.drawing.DrawingDocument", "draw_pdf_Export"),
        )
        for service, candidate in export_filters:
            if doc.supportsService(service):
                filter_name = candidate
                break

        ### Save File
        doc.storeToURL(url_save, (
            make_property("Overwrite", True),
            make_property("FilterName", filter_name),
        ))
    finally:
        # Always release the document, even when the export fails, so that
        # hidden documents do not pile up inside the soffice process.
        doc.dispose()
|
1
2
3
4
5
6
|
# Pass the arguments as a list: formatting the paths into one string and
# splitting it on ' ' breaks any path that contains a space.
cmd = ["pdftotext", source_file, dst_file]
# NOTE: stdout/stderr are pipes, so call popen.communicate() to collect the
# output and wait for pdftotext to finish.
popen = subprocess.Popen(
    args=cmd,
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE)
|
1
2
3
4
5
6
|
# Pass the arguments as a list: formatting the paths into one string and
# splitting it on ' ' breaks any path that contains a space.
cmd = ["pdf2swf", "-T9", source_file, "-o", dst_file]
# NOTE: stdout/stderr are pipes, so call popen.communicate() to collect the
# output and wait for pdf2swf to finish.
popen = subprocess.Popen(
    args=cmd,
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE)
|
1
2
3
4
5
6
7
8
9
10
|
# Render the first page of a PDF to an image file with the swftools gfx module.
pdf_doc = gfx.open("pdf", tmp_pdf_file)  # was "ux.open": the module is gfx
img = gfx.ImageList()
# Turn on antialiasing; "antialise" is the spelling used in the gfx tutorial.
img.setparameter("antialise", "1")
page1 = pdf_doc.getPage(1)
# NOTE: this is the size of the generated image and must match the page's
# width and height. If it is smaller than the page, the image is cropped;
# it is not scaled down automatically.
img.startpage(page1.width, page1.height)
page1.render(img)
img.endpage()
img.save(tmp_img_file)
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
import os
import subprocess
import tempfile
import time


class Process():
    """Run an external command, optionally killing it after a timeout."""

    # Seconds between checks on whether the child has exited.
    _POLL_INTERVAL = 0.05
    # Seconds the child gets to exit after SIGTERM before it is killed.
    _TERM_GRACE = 2

    @classmethod
    def run(cls, cmd, timeout=0):
        """Run ``cmd`` and return its exit status.

        Args:
            cmd: Either a command string, which is split on whitespace, or a
                list/tuple of arguments. Use the list form for paths that
                contain spaces.
            timeout: Maximum run time in seconds. 0 (the default) waits
                until the command exits on its own.

        Returns:
            The exit status of the command. On POSIX it is negative when the
            command was killed by a signal, e.g. after a timeout.

        Raises:
            TypeError: If ``timeout`` is not a number.
        """
        if not isinstance(timeout, (int, float)):
            raise TypeError(
                "timeout must be a number of seconds, got %r" % (timeout,))
        args = cmd.split() if hasattr(cmd, "split") else list(cmd)

        # Capture output in temporary files instead of pipes: a child that
        # writes more than the pipe buffer holds would block forever while
        # this process is waiting for it without reading.
        devnull = open(os.devnull, "rb")
        out_file = tempfile.TemporaryFile()
        err_file = tempfile.TemporaryFile()
        try:
            popen = subprocess.Popen(args=args,
                                     stdin=devnull,
                                     stdout=out_file,
                                     stderr=err_file)
            if timeout != 0:
                cls._wait_until(popen, timeout)
                if popen.poll() is None:
                    # Ask politely first so the tool can clean up; only
                    # force-kill if it ignores the request.
                    popen.terminate()
                    cls._wait_until(popen, cls._TERM_GRACE)
                    if popen.poll() is None:
                        popen.kill()
            ret = popen.wait()
            if ret != 0:
                print("[INFO] process stdout:\n%s" % cls._read_text(out_file))
                print("[INFO] process stderr:\n%s" % cls._read_text(err_file))
            return ret
        finally:
            devnull.close()
            out_file.close()
            err_file.close()

    @classmethod
    def _wait_until(cls, popen, seconds):
        """Poll ``popen`` until it exits or ``seconds`` have elapsed."""
        deadline = time.time() + seconds
        while popen.poll() is None and time.time() < deadline:
            time.sleep(cls._POLL_INTERVAL)

    @staticmethod
    def _read_text(stream):
        """Return everything written to ``stream`` as printable text."""
        stream.seek(0)
        data = stream.read()
        if not isinstance(data, str):
            # Python 3 returns bytes here; decode so it prints readably.
            data = data.decode("utf-8", "replace")
        return data


# Example: run pdf2swf with a timeout of 5*60 seconds (5 minutes).
Process.run("pdf2swf -T9 %s -o %s" % (source_file, dst_file), 5*60)