Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et: 

 

# Copyright 2014-2018 Florian Bruhin (The Compiler) <mail@qutebrowser.org> 

# 

# This file is part of qutebrowser. 

# 

# qutebrowser is free software: you can redistribute it and/or modify 

# it under the terms of the GNU General Public License as published by 

# the Free Software Foundation, either version 3 of the License, or 

# (at your option) any later version. 

# 

# qutebrowser is distributed in the hope that it will be useful, 

# but WITHOUT ANY WARRANTY; without even the implied warranty of 

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

# GNU General Public License for more details. 

# 

# You should have received a copy of the GNU General Public License 

# along with qutebrowser. If not, see <http://www.gnu.org/licenses/>. 

 

"""pyPEG parsing for the RFC 6266 (Content-Disposition) header.""" 

 

import urllib.parse 

import string 

import re 

 

import attr 

import pypeg2 as peg 

 

from qutebrowser.utils import utils 

 

 

class UniqueNamespace(peg.Namespace):

    """A pyPEG2 namespace that refuses to bind the same key twice."""

    def __setitem__(self, key, value):
        """Store value under key.

        Raises:
            DuplicateParamError: If key is already present.
        """
        if key not in self:
            super().__setitem__(key, value)
            return
        raise DuplicateParamError(key)

 

 

# RFC 2616: the control characters (code points 0-31 and 127)
ctl_chars = ''.join(map(chr, range(32))) + chr(127)


# RFC 5987: ASCII letters and digits plus the punctuation listed here
attr_chars_nonalnum = '!#$&+-.^_`|~'
attr_chars = ''.join((string.ascii_letters, string.digits,
                      attr_chars_nonalnum))


# RFC 5987 gives this alternative construction of the token character class
token_chars = attr_chars + "*'%"  # flake8: disable=S001


# Definitions from https://tools.ietf.org/html/rfc2616#section-2.2
# token was redefined from attr_chars to avoid using AnyBut,
# which might include non-ascii octets.
# The pattern matches one or more characters out of token_chars.
token_re = '[' + re.escape(token_chars) + ']+'

 

 

class Token(str):

    """A token as defined by RFC 2616, Section 2.2."""

    # Matches one or more characters, as described by token_re
    grammar = re.compile(token_re)

 

 

# RFC 2616 says some linear whitespace (LWS) is in fact allowed in text 

# and qdtext; however it also mentions folding that whitespace into 

# a single SP (which isn't in CTL) before interpretation. 

# Assume the caller already did that folding when parsing headers.

 

# NOTE: qdtext also allows non-ascii, which we choose to parse 

# as ISO-8859-1; rejecting it entirely would also be permitted. 

# Some broken browsers attempt encoding-sniffing, which is broken 

# because the spec only allows iso, and because encoding-sniffing 

# can mangle valid values. 

# Everything else in this grammar (including RFC 5987 ext values) 

# is in an ascii-safe encoding. 

 

# One character that is neither a double quote nor a control character
qdtext_re = '[^"' + re.escape(ctl_chars) + ']'
# A backslash followed by any single character in the range 0-127
quoted_pair_re = r'\\[' + re.escape(''.join(map(chr, range(128)))) + ']'

 

 

class QuotedString(str):

    """A quoted-string as defined by RFC 2616, Section 2.2."""

    # A double-quoted run of one or more quoted-pairs / qdtext characters
    grammar = re.compile(r'"({}|{})+"'.format(quoted_pair_re, qdtext_re))

    def __str__(self):
        """Return the unquoted, unescaped text."""
        quoted = super().__str__()
        inner = quoted[1:-1]  # strip the enclosing double quotes
        # Replace every backslash-escaped character by the character itself
        return re.sub(r'\\(.)', r'\1', inner)

 

 

class Value(str):

    """A parameter value (RFC 2616, Section 3.6).

    Either a bare token or a quoted string.
    """

    grammar = [
        re.compile(token_re),
        QuotedString,
    ]

 

 

class Charset(str):

    """A charset as defined by RFC 5987, Section 3.2.1."""

    # Only these two charsets are accepted (in any letter case); other
    # charsets are forbidden, the spec reserves them for future evolutions.
    grammar = re.compile('UTF-8|ISO-8859-1', flags=re.IGNORECASE)

 

 

class Language(str):

    """A language-tag (RFC 5646, Section 2.1).

    The grammar is an approximation: it accepts any non-empty run of ASCII
    letters, digits and hyphens.

    FIXME: This grammar is not 100% correct yet.
    https://github.com/qutebrowser/qutebrowser/issues/105
    """

    grammar = re.compile(
        '[A-Za-z0-9-]+'
    )

 

 

# A single character out of attr_chars
attr_char_re = '[' + re.escape(attr_chars) + ']'
# A percent sign followed by exactly two hexadecimal digits
hex_digit_re = '%[{}]{{2}}'.format(string.hexdigits)

 

 

class ValueChars(str):

    """A value of an attribute.

    Matches a (possibly empty) run of attr-chars and percent-escapes.

    FIXME: Can we merge this with Value?
    https://github.com/qutebrowser/qutebrowser/issues/105
    """

    grammar = re.compile(
        '({}|{})*'.format(attr_char_re, hex_digit_re))

 

 

class ExtValue(peg.List):

    """An ext-value of an attribute (RFC 5987, Section 3.2).

    Consists of a charset, an optional language and the encoded value
    characters, separated by single quotes.
    """

    grammar = peg.contiguous(
        Charset,
        "'",
        peg.optional(Language),
        "'",
        ValueChars,
    )

 

 

class ExtToken(peg.Symbol):

    """A token introducing an extended value (RFC 6266, Section 4.1)."""

    # A token immediately followed by a literal asterisk
    regex = re.compile(r'{}\*'.format(token_re))

    def __str__(self):
        """Return the token text in lower case."""
        text = super().__str__()
        return text.lower()

 

 

class NoExtToken(peg.Symbol):

    """A token introducing a normal value (RFC 6266, Section 4.1)."""

    # A token whose last character is not an asterisk
    regex = re.compile(r'{}(?<!\*)'.format(token_re))

    def __str__(self):
        """Return the token text in lower case."""
        text = super().__str__()
        return text.lower()

 

 

class DispositionParm(str):

    """A regular disposition parameter (RFC 6266, Section 4.1).

    Parsed as a non-extended token name, an equals sign and a Value.
    """

    grammar = (peg.attr('name', NoExtToken), '=', Value)

 

 

class ExtDispositionParm:

    """An extended disposition parameter (RFC 6266, Section 4.1).

    Parsed as an extended token name, an equals sign and an ExtValue.
    """

    grammar = (peg.attr('name', ExtToken), '=', ExtValue)

    def __init__(self, value, name=None):
        """Remember the parsed value and, if given, the parameter name."""
        self.name, self.value = name, value

 

 

class DispositionType(peg.List):

    """The disposition type (RFC 6266, Section 4.1)."""

    # Either "inline"/"attachment" (in any letter case) or any other Token
    grammar = [
        re.compile('(inline|attachment)', flags=re.IGNORECASE),
        Token,
    ]

 

 

class DispositionParmList(UniqueNamespace):

    """The disposition parameters of a header (RFC 6266, Section 4.1).

    As a UniqueNamespace, it rejects a parameter which is given twice.
    """

    # Zero or more parameters, each one introduced by a semicolon
    grammar = peg.maybe_some(
        ';',
        [ExtDispositionParm, DispositionParm],
    )

 

 

class ContentDispositionValue:

    """A complete Content-Disposition value (RFC 6266, Section 4.1).

    The disposition type is stored as "dtype", the parameters as "params".
    """

    # Allows nonconformant final semicolon
    # I've seen it in the wild, and browsers accept it
    # http://greenbytes.de/tech/tc2231/#attwithasciifilenamenqs
    grammar = (
        peg.attr('dtype', DispositionType),
        peg.attr('params', DispositionParmList),
        peg.optional(';'),
    )

 

 

@attr.s
class LangTagged:

    """A string together with the language it is tagged with."""

    # The text itself
    string = attr.ib()
    # The language associated with the text
    langtag = attr.ib()

 

 

class Error(Exception):

    """Common base class of all RFC 6266 errors in this module."""

 

 

class DuplicateParamError(Error):

    """Raised when the same parameter has been given more than once."""

 

 

class InvalidISO8859Error(Error):

    """Raised when a value contains a byte which is invalid in ISO-8859-1."""

 

 

class _ContentDisposition: 

 

"""Records various indications and hints about content disposition. 

 

These can be used to know if a file should be downloaded or 

displayed directly, and to hint what filename it should have 

in the download case. 

""" 

 

def __init__(self, disposition, assocs): 

"""Used internally after parsing the header.""" 

assert len(disposition) == 1 

self.disposition = disposition[0] 

self.assocs = dict(assocs) # So we can change values 

if 'filename*' in self.assocs: 

param = self.assocs['filename*'] 

assert isinstance(param, ExtDispositionParm) 

self.assocs['filename*'] = parse_ext_value(param.value).string 

 

def filename(self): 

"""The filename from the Content-Disposition header or None. 

 

On safety: 

 

This property records the intent of the sender. 

 

You shouldn't use this sender-controlled value as a filesystem path, it 

can be insecure. Serving files with this filename can be dangerous as 

well, due to a certain browser using the part after the dot for 

mime-sniffing. Saving it to a database is fine by itself though. 

""" 

if 'filename*' in self.assocs: 

return self.assocs['filename*'] 

elif 'filename' in self.assocs: 

# XXX Reject non-ascii (parsed via qdtext) here? 

return self.assocs['filename'] 

return None 

 

def is_inline(self): 

"""Return if the file should be handled inline. 

 

If not, and unless your application supports other dispositions 

than the standard inline and attachment, it should be handled 

as an attachment. 

""" 

return self.disposition.lower() == 'inline' 

 

def __repr__(self): 

return utils.get_repr(self, constructor=True, 

disposition=self.disposition, assocs=self.assocs) 

 

 

def normalize_ws(text):
    """Do LWS (linear whitespace) folding.

    Every run of whitespace is replaced by a single space, and leading and
    trailing whitespace is removed.
    """
    words = text.split()
    return ' '.join(words)

 

 

def parse_headers(content_disposition):
    """Build a _ContentDisposition from header values.

    Args:
        content_disposition: The header value as bytes; it gets decoded as
                             ISO-8859-1.

    Return:
        A _ContentDisposition built from the parsed disposition type and
        parameters.
    """
    # We allow non-ascii here (it will only be parsed inside of qdtext, and
    # rejected by the grammar if it appears in other places), although parsing
    # it can be ambiguous. Parsing it ensures that a non-ambiguous filename*
    # value won't get dismissed because of an unrelated ambiguity in the
    # filename parameter. But it does mean we occasionally give
    # less-than-certain values for some legacy senders.
    text = content_disposition.decode('iso-8859-1')
    # Our parsing is relaxed in these regards:
    # - The grammar allows a final ';' in the header;
    # - We do LWS-folding, and possibly normalise other broken
    #   whitespace, instead of rejecting non-lws-safe text.
    # XXX Would prefer to accept only the quoted whitespace
    # case, rather than normalising everything.
    folded = normalize_ws(text)
    tree = peg.parse(folded, ContentDispositionValue)
    return _ContentDisposition(disposition=tree.dtype, assocs=tree.params)

 

 

def parse_ext_value(val):
    """Parse the value of an extended attribute.

    Args:
        val: The parsed ext-value: either (charset, langtag, coded) or, when
             no language was given, (charset, coded).

    Return:
        A LangTagged with the percent-decoded string and the language tag
        (None if no language was given).

    Raises:
        InvalidISO8859Error: If the charset is ISO-8859-1 (in any letter
                             case) and the decoded value contains a
                             character in the range 0x7F-0x9F.
    """
    if len(val) == 3:
        charset, langtag, coded = val
    else:
        charset, coded = val
        langtag = None
    # errors='strict' makes undecodable percent-escapes raise instead of
    # being replaced silently.
    decoded = urllib.parse.unquote(coded, charset, errors='strict')
    # The charset is matched case-insensitively by the grammar, so it has to
    # be compared case-insensitively here as well - otherwise a charset like
    # "ISO-8859-1" would bypass the check below.
    if charset.lower() == 'iso-8859-1':
        # Fail if the filename contains an invalid ISO-8859-1 char
        for c in decoded:
            if 0x7F <= ord(c) <= 0x9F:
                raise InvalidISO8859Error(c)
    return LangTagged(decoded, langtag)