The following script finds out which pages of a PDF file contain color; I wrote it for my thesis. It requires Ghostscript (`gs`) to be installed. Save it to a file and run it with `python script.py INPUTPDF.pdf`.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
#!/usr/bin/env python
"""Report which pages of a PDF file contain color.

The script runs Ghostscript's ``inkcov`` device on the input file and lists
every page whose cyan, magenta or yellow coverage is non-zero.

Usage: script.py <input.pdf>

The code deliberately sticks to syntax that runs on both Python 2 and 3.
"""
import os.path
import sys
import getopt
import subprocess
import collections
from pprint import pprint


def _run_inkcov(input_file):
    """Run ``gs -o - -sDEVICE=inkcov`` on *input_file*.

    Returns the text Ghostscript wrote to stdout, or None when Ghostscript
    could not be started or exited with a non-zero status.
    """
    # Build the argument list directly instead of splitting a command string,
    # so file names containing whitespace stay a single argument.
    command = ["gs", "-o", "-", "-sDEVICE=inkcov", input_file]
    try:
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            universal_newlines=True,  # decode stdout to text on Python 3
        )
    except OSError:
        # Raised when the gs executable cannot be found or executed.
        return None
    output, _ = process.communicate()
    # stderr is not captured, so the exit status is the only failure signal.
    if process.returncode != 0:
        return None
    return output


def _classify_pages(output):
    """Map each "Page N" header in inkcov *output* to its color class.

    The expected input looks like this (page one is black and white, page
    two has colors):

        Page 1
         0.00000  0.00000  0.00000  0.02230 CMYK OK
        Page 2
         0.02360  0.02360  0.02360  0.02360 CMYK OK

    See https://tex.stackexchange.com/questions/53493/detecting-all-pages-which-contain-color/61216#61216

    Returns an OrderedDict (in page order) whose values are either "Color"
    or "BlackAndWhite".
    """
    result = collections.OrderedDict()
    last_page = ""
    for raw_line in output.splitlines():
        line = raw_line.strip()
        if line.startswith("Page"):
            last_page = line
        elif "CMYK" in line:
            try:
                # The first three fields are the C, M and Y coverage; the
                # fourth (K) is black and does not make a page "color".
                cmy = [float(field) for field in line.split()[:3]]
            except ValueError:
                # Mentions CMYK but is not an ink-coverage line; ignore it.
                continue
            # A page is color as soon as ANY of C, M, Y is non-zero.
            if any(coverage > 0 for coverage in cmy):
                result[last_page] = "Color"
            else:
                result[last_page] = "BlackAndWhite"
    return result


def main(argv=None):
    """Print the numbers of all pages of a PDF that contain color.

    argv -- list of command line arguments without the program name; the
            first element is the path of the PDF. Defaults to sys.argv[1:].

    Exits with status 2 when no file name is given or Ghostscript fails.
    Prints a message and returns when the given path is not a file.
    """
    if argv is None:
        argv = sys.argv[1:]

    print("Welcome to PDFColorParser.")
    print("This script will tell you all PDF pages which contain colors.")
    print("This can be usefull if you need to a huge document in which some pages ar black and white and some not. ")
    print("This script requires Ghostscript (gs) being installed on your system.")

    # check if the user passed a file name as argument
    if not argv:
        print('USAGE: script.py <input.pdf>')
        sys.exit(2)

    input_file = str(argv[0])
    if not os.path.isfile(input_file):
        print(str(input_file) + " is not a valid file.")
        return

    output = _run_inkcov(input_file)
    if output is None:
        print("There was an error parsing the input file " + input_file)
        sys.exit(2)

    # when there is no error - process the generated output
    result = _classify_pages(output)

    # print all pages which contain color
    color_pages = [
        page.replace("Page ", "")
        for page, kind in result.items()
        if kind == "Color"
    ]
    print("The following " + str(len(color_pages)) + " pages of the total " + str(len(result)) + " pages are in color:")
    print(", ".join(color_pages))


if __name__ == "__main__":
    main(sys.argv[1:])