import java.io.*; import java.net.*; import java.util.*; import java.util.function.*; import java.util.logging.Level; import java.util.logging.Logger; /* Changelog 1/8/2017: - Improved capabilities of program to find obsolete EC entries and replace them with the new EC if available - If the new EC of an obsolete EC is already in the EC list, KEGG Miner will no longer include both entries preventing duplicate entries - KEGG Miner no longer includes EC entries representing one organism into the data file - Added a reporter system to the addition of ECs to the export file for debugging */ public class DataMiner { //Global Instantiated Variables private final String COMPARE_LIST = "BPI:UDI:SMUP:SUM:SMS:SMG:CHE:BRE:BBU:ZIN:NDL:TPQ:SSDC:HCI:LAS:BHE:BQU:APH:ECH:WPI:WCL:WOL:WRI:OTT:RPQ:RFE:RMC:RAM:FTT:PLR:PLY:CRP:BCI:CEY:CEA:ASY:YPG:MEN:RIP:HDE:BEN:BFL:BCC:BUC:WBR:SGL:PES:SCJ:SLL:SAPI:OHO:FTE:FIN:ZPR:ALM:PKO:FLI:CHU:SLR:SCD:SSM:SBU:AKA:TBD:CVI:NEU:CCS:ELI:PHL:OCO:HEL:HAA:SON:ECO:PFQ:MBJ:ACL:"; private DataMinerInterface DMI; private boolean includeInList; private ArrayList VitaminIDs; public DataMiner(DataMinerInterface DMI){ this.DMI = DMI; } //Advanced Methods public void mineKegg(File[] files) throws Exception{ PrintWriter writer2 = null; for (int i = 0; i < files.length; i++){ //Instatiates arraylists and wipes used arraylists ArrayList setAbbrv = new ArrayList(); ArrayList excel = new ArrayList(); ArrayList onesandzeros = new ArrayList(); VitaminIDs = new ArrayList(); ArrayList ECs = new ArrayList(); // String processedString = ""; for (int k = 0; k //Generates file reader FileReader file = new FileReader(files[i]); BufferedReader in = new BufferedReader(file); String inputLine; //Adds all ECs to be processed from text file String filename = files[i].toString(); //System.out.println(filename + "\n==============================="); DMI.addText("\n" + filename + "\n=======================================\n"); while ((inputLine = in.readLine()) != null){ inputLine = inputLine.replace("�", ""); //Removes anamolous characters from input text files inputLine = inputLine.replace(" ", ""); //System.out.println(inputLine); //Debugging EC printer DMI.addText(inputLine + "\n"); VitaminIDs.add(inputLine); } //System.out.println(VitaminIDs); //Debugging VitaminID arraylist additions in.close(); //Actual data mining process for(int j = 0; j setAbbrv, ArrayList onesandzeros){ //Generates temporary array list of collected data ArrayList collAbbrv = new ArrayList(); //Adds output to ArrayList String processedString = ""; for (int i = 0; i grabECsFromPDF(FileReader file, boolean isNAD) throws Exception{ ArrayList legitECs = new ArrayList(); BufferedReader in = new BufferedReader(file); String docInputLine; URL url; while ((docInputLine = in.readLine()) != null){ if (docInputLine.contains("(https://urldefense.proofpoint.com/v2/url?u=http-3A__www.enzyme-2Ddatabase.org_query.php-3Fec-3D&d=AwIGaQ&c=1QsCMERiq7JOmEnKpsSyjg&r=j9BjNmxXIwCVhVOItuh4qA&m=DKh9xIxhDhnzC8qOe7BJ_a7V6hroczkaCz_-G2-e5e0&s=L4qsGW7chShEN_HI9asJIC2Bq-IVA_9gTdnorch6JR4&e= ")){ url = new URL(docInputLine.substring(docInputLine.indexOf("http"),docInputLine.indexOf(")>>"))); if(webParser(url,isNAD)){ DMI.addText(docInputLine.substring(docInputLine.indexOf("ec=")+3,docInputLine.indexOf(")>>"))); DMI.addText("\n\n"); legitECs.add(docInputLine.substring(docInputLine.indexOf("ec=")+3,docInputLine.indexOf(")>>"))); } } } in.close(); return legitECs; } public ArrayList grabECsFromList() throws Exception{ ArrayList legitECs = new ArrayList(); FileReader file = new FileReader("C:\\Users\\Brian\\Documents\\NetBeansProjects\\Programming\\input.txt"); BufferedReader in = new BufferedReader(file); String docInputLine; URL url; //Parses through local document while ((docInputLine = in.readLine()) != null){ url = new URL("https://urldefense.proofpoint.com/v2/url?u=http-3A__www.enzyme-2Ddatabase.org_query.php-3Fec-3D&d=AwIGaQ&c=1QsCMERiq7JOmEnKpsSyjg&r=j9BjNmxXIwCVhVOItuh4qA&m=DKh9xIxhDhnzC8qOe7BJ_a7V6hroczkaCz_-G2-e5e0&s=L4qsGW7chShEN_HI9asJIC2Bq-IVA_9gTdnorch6JR4&e= "+docInputLine); if(webParser(url,true)){ DMI.addText(docInputLine + "\n"); legitECs.add(docInputLine); } } in.close(); return legitECs; } public ArrayList scanSpecialCases(File[] files, boolean isNAD) throws Exception{ ArrayList legitECs = new ArrayList(); for(File file : files){ FileReader f = new FileReader(file); BufferedReader in = new BufferedReader(f); String docInputLine; URL url; while ((docInputLine = in.readLine()) != null){ if (docInputLine.contains("(https://urldefense.proofpoint.com/v2/url?u=http-3A__www.enzyme-2Ddatabase.org_query.php-3Fec-3D&d=AwIGaQ&c=1QsCMERiq7JOmEnKpsSyjg&r=j9BjNmxXIwCVhVOItuh4qA&m=DKh9xIxhDhnzC8qOe7BJ_a7V6hroczkaCz_-G2-e5e0&s=L4qsGW7chShEN_HI9asJIC2Bq-IVA_9gTdnorch6JR4&e= ")){ url = new URL(docInputLine.substring(docInputLine.indexOf("http"),docInputLine.indexOf(")>>"))); if(webParserSpecialCases(url,isNAD)){ DMI.addText(docInputLine.substring(docInputLine.indexOf("ec=")+3,docInputLine.indexOf(")>>"))); DMI.addText("\n\n"); legitECs.add(docInputLine.substring(docInputLine.indexOf("ec=")+3,docInputLine.indexOf(")>>"))); } } } export(file.toString().substring(0,file.toString().indexOf(".pdf")) + " special cases output.txt",legitECs); in.close(); } DMI.getStartButton().setEnabled(true); return legitECs; } public void export(String filename, ArrayList excel){ try{ PrintWriter writer = new PrintWriter(filename, "UTF-8"); for (int i = 0; i wr.close(); return false; } //Reads through https://urldefense.proofpoint.com/v2/url?u=http-3A__www.enzyme-2Ddatabase.org_query.php-3Fec-3Dx.x.x.x&d=AwIGaQ&c=1QsCMERiq7JOmEnKpsSyjg&r=j9BjNmxXIwCVhVOItuh4qA&m=DKh9xIxhDhnzC8qOe7BJ_a7V6hroczkaCz_-G2-e5e0&s=V6fwekzSSYEc2Uvs3LN7egQmvlaJ42YNjmxpjvk9ClM&e= for hits of scripted target !!!Must be rewritten!!! private boolean webParserSpecialCases(URL url,boolean isNAD) throws Exception{ //Generates web reader boolean isParser = false; BufferedReader wr = new BufferedReader( new InputStreamReader(url.openStream())); String webInputLine; //Parses through web page and acquires data while ((webInputLine = wr.readLine()) != null){ if(isNAD){ if(NAD_Criterion(webInputLine,isParser)){ DMI.addText("Special Case:\n"); //Debug code for checking if claim to NAD dependence is legitimized by the presence of NADP as a reactent DMI.addText(webInputLine + "\n"); return true; } } else{ if(NADP_Criterion(webInputLine,isParser)){ DMI.addText("Special Case:\n"); //Debug code for checking if claim to NADP dependence is legitimized by the presence of NADP as a reactent DMI.addText(webInputLine + "\n"); return true; } } } // wr.close(); return false; } //Scans for special cases in ones ECs that would fail the normal criterions //Advanced Boolean Handlers private boolean NADP_Criterion(String webInputLine, boolean isParser){ if(isParser){ if(webInputLine.contains("Reaction:")&&((webInputLine.contains("NADP"))||(webInputLine.contains("NAD(P)")))){ return true; } if((webInputLine.contains("Comments:"))&&(webInputLine.contains("Requires NADP")||webInputLine.contains("Requires NAD(P)"))){ return true; } } else if (!isParser){ if(webInputLine.contains("Reaction:")&&((webInputLine.contains("NADP")||webInputLine.contains("NAD(P)")))){ return false; } if((webInputLine.contains("Comments:"))&&(webInputLine.contains("NADP")||webInputLine.contains("NAD(P)"))&&!(webInputLine.contains("Requires NADP")||webInputLine.contains("Requires NAD(P)"))) { return true; } } return false; } private boolean NAD_Criterion(String webInputLine, boolean isParser){ if(isParser){ if(webInputLine.contains("Reaction:")&&(webInputLine.contains("NAD"))&&(!webInputLine.contains("NADP"))){ return true; } if((webInputLine.contains("Comments:"))&&(webInputLine.contains("Requires NAD")) &&!(webInputLine.contains("Requires NADP"))){ return true; } } else if (!isParser) { if(webInputLine.contains("Reaction:")&&(webInputLine.contains("NAD"))&&(!webInputLine.contains("NADP"))){ return false; } if((webInputLine.contains("Comments:"))&&(webInputLine.contains("NAD"))&&!(webInputLine.contains("Requires NAD"))) { return true; } } return false; } }