Replace statistics.py

6cf7c8a7 · weiwei · 0dbe8ce2 · 6cf7c8a7
--- a/statistics.py
+++ b/statistics.py
 #python3
-import odf
-import sys
+import docx
+import os, zipfile, xml.dom.minidom, sys, getopt
 import requests, json
-from odf.namespaces import TEXTNS
-from odf.element import Element
-from odf.opendocument import load
-from odf import text,meta,office,draw

-# 加载报告文件
-doc=load(sys.argv[1])
+# Page count, character count, table count and image count
+page_count = 0
+character_count = 0
+table_count = 0
+image_count = 0
+
 # API路径
 api_url = sys.argv[2] + "/api/v4/labreportstatistics?" + sys.argv[4] + "=" + sys.argv[5]
 # 项目id
@@ -16,32 +16,37 @@ project_id = sys.argv[3]
 # ci_job_token
 token = sys.argv[4]

-# 将需要的报告统计参数保存
-for stat in doc.getElementsByType(meta.DocumentStatistic):
-    # 取得报告页数、字数、表格数、图片数信息
-    page_count = stat.getAttribute('pagecount')
-    word_count = stat.getAttribute('wordcount')
-    table_count = stat.getAttribute('tablecount')
-    image_count = stat.getAttribute('imagecount')
-    # 还可以统计出报告中的以下内容
-    # stat.getAttribute('cellcount')
-    # 汉字或字符数量(含空格)stat.getAttribute('charactercount')
-    # stat.getAttribute('drawcount')
-    # stat.getAttribute('framecount')
-    # 汉字或字符数量(不含空格)stat.getAttribute('nonwhitespacecharactercount')
-    # stat.getAttribute('objectcount')
-    # 链接数量stat.getAttribute('oleobjectcount')
-    # 段落数量stat.getAttribute('paragraphcount')
-    # stat.getAttribute('rowcount')
-    # 句子数量stat.getAttribute('sentencecount')
-    # stat.getAttribute('syllablecount')
+# Read the page count and character count of the docx file from docProps/app.xml
+document = zipfile.ZipFile(sys.argv[1])
+dxm1 = document.read('docProps/app.xml')
+uglyxml = xml.dom.minidom.parseString(dxm1)
+page_count = uglyxml.getElementsByTagName('Pages')[0].childNodes[0].nodeValue
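+# This character count treats each Chinese character as one character and each letter as one character.
+# Use Characters if it is present; otherwise fall back to CharactersWithSpaces.
+# In testing, a docx file opened in LibreOffice and saved with "Use Microsoft Word 2007-2013 XML Format" still had its images counted correctly, but its app.xml contained only CharactersWithSpaces and no Characters element.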
+find_character = uglyxml.getElementsByTagName('Characters')
+if not find_character:
+  character_count = uglyxml.getElementsByTagName('CharactersWithSpaces')[0].childNodes[0].nodeValue
+else:  
+  character_count = find_character[0].childNodes[0].nodeValue
+
+# Count the tables in the docx file
+mydocx = docx.Document(sys.argv[1])
+table_count = len(mydocx.tables)

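+# Count the images in the docx file
+# Inspecting target_ref while debugging showed that images in the document get a default name of the form imageX, so a relationship is counted as an image when its target_ref contains "image"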
+dict_rel = mydocx.part._rels
+for rel in dict_rel:
+  rel_info = dict_rel[rel]
+  if "image" in rel_info.target_ref:
+    image_count += 1

 # 准备向接口发送的参数
 data = {
    'project_id': project_id, 
    'page_count':page_count, 
-    'word_count':word_count, 
+    'word_count':character_count, 
    'table_count':table_count, 
    'image_count':image_count
    }